mirror of
https://github.com/lancedb/lancedb.git
synced 2026-03-28 11:30:39 +00:00
When using hybrid search with a where filter, the prefilter argument is silently inverted. Passing prefilter=True actually performs post-filtering, and prefilter=False actually performs pre-filtering.
3510 lines
125 KiB
Python
3510 lines
125 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||
|
||
from __future__ import annotations
|
||
|
||
from abc import ABC, abstractmethod
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from enum import Enum
|
||
from datetime import timedelta
|
||
from typing import (
|
||
TYPE_CHECKING,
|
||
Dict,
|
||
List,
|
||
Literal,
|
||
Optional,
|
||
Tuple,
|
||
Type,
|
||
TypeVar,
|
||
Union,
|
||
Any,
|
||
)
|
||
|
||
import asyncio
|
||
import deprecation
|
||
import numpy as np
|
||
import pyarrow as pa
|
||
import pyarrow.compute as pc
|
||
import pyarrow.fs as pa_fs
|
||
import pydantic
|
||
|
||
from lancedb.pydantic import PYDANTIC_VERSION
|
||
from lancedb.background_loop import LOOP
|
||
|
||
from . import __version__
|
||
from .arrow import AsyncRecordBatchReader
|
||
from .dependencies import pandas as pd
|
||
from .rerankers.base import Reranker
|
||
from .rerankers.rrf import RRFReranker
|
||
from .rerankers.util import check_reranker_result
|
||
from .util import flatten_columns
|
||
from lancedb._lancedb import fts_query_to_json
|
||
from typing_extensions import Annotated
|
||
|
||
if TYPE_CHECKING:
|
||
import sys
|
||
import PIL
|
||
import polars as pl
|
||
|
||
from ._lancedb import Query as LanceQuery
|
||
from ._lancedb import FTSQuery as LanceFTSQuery
|
||
from ._lancedb import HybridQuery as LanceHybridQuery
|
||
from ._lancedb import VectorQuery as LanceVectorQuery
|
||
from ._lancedb import TakeQuery as LanceTakeQuery
|
||
from ._lancedb import PyQueryRequest
|
||
from .common import VEC
|
||
from .pydantic import LanceModel
|
||
from .table import Table
|
||
|
||
if sys.version_info >= (3, 11):
|
||
from typing import Self
|
||
else:
|
||
from typing_extensions import Self
|
||
|
||
T = TypeVar("T", bound="LanceModel")
|
||
|
||
|
||
# Pydantic validation function for vector queries
def ensure_vector_query(
    val: Any,
) -> Union[List[float], List[List[float]], pa.Array, List[pa.Array]]:
    """Validate the ``Query.vector`` field.

    Accepts a single vector (a non-empty list of numbers or a ``pa.Array``)
    or a batch of vectors (list of lists of numbers, or list of ``pa.Array``)
    and returns the value unchanged.

    Raises
    ------
    ValueError
        If the value is empty, a bare scalar, or not a recognized vector
        representation.
    """
    if isinstance(val, list):
        if len(val) == 0:
            # BUG FIX: this previously *returned* the ValueError instance
            # instead of raising it, so the exception object was silently
            # passed through as the "validated" vector.
            raise ValueError("Vector query must be a non-empty list")
        sample = val[0]
    else:
        if isinstance(val, float):
            raise ValueError(
                "Vector query must be a list of floats or a list of lists of floats"
            )
        sample = val
    # Check plain numeric samples before pa.Array so ordinary Python vectors
    # are validated without touching pyarrow types. Accepting ints as well as
    # floats fixes the old behavior of silently returning None for e.g. [1, 2].
    if isinstance(sample, (int, float)):
        # val is a list of numbers (a single vector)
        return val
    if isinstance(sample, list):
        if len(sample) == 0:
            # BUG FIX: was `return ValueError(...)` (see above).
            raise ValueError("Vector query must be a non-empty list")
        if isinstance(sample[0], (int, float)):
            # val is a list of lists of numbers (a batch of vectors)
            return val
    if isinstance(sample, pa.Array):
        # val is an arrow array or a list of arrow arrays
        return val
    # BUG FIX: previously unrecognized input fell through and returned None,
    # silently discarding the query vector.
    raise ValueError(
        "Vector query must be a list of floats, a list of lists of floats, "
        "a pa.Array, or a list of pa.Array"
    )
|
||
|
||
|
||
class FullTextQueryType(str, Enum):
    """Type tag of a full-text query (returned by ``FullTextQuery.query_type``)."""

    MATCH = "match"
    MATCH_PHRASE = "match_phrase"
    BOOST = "boost"
    MULTI_MATCH = "multi_match"
    BOOLEAN = "boolean"
|
||
|
||
|
||
class FullTextOperator(str, Enum):
    """Operator used to combine the terms of a match query (AND / OR)."""

    AND = "AND"
    OR = "OR"
|
||
|
||
|
||
class Occur(str, Enum):
    """Occurrence requirement of a sub-query inside a ``BooleanQuery``.

    ``FullTextQuery.__and__`` combines queries with MUST and
    ``FullTextQuery.__or__`` with SHOULD.
    """

    SHOULD = "SHOULD"
    MUST = "MUST"
    MUST_NOT = "MUST_NOT"
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class FullTextQuery(ABC):
    """Abstract base class of all full-text search queries.

    Concrete subclasses (``MatchQuery``, ``PhraseQuery``, ``BoostQuery``,
    ``MultiMatchQuery``, ``BooleanQuery``) report their kind via
    ``query_type`` and can be serialized with ``to_json`` or combined with
    the ``&`` / ``|`` operators into a ``BooleanQuery``.
    """

    @abstractmethod
    def query_type(self) -> FullTextQueryType:
        """
        Get the query type of the query.

        Returns
        -------
        str
            The type of the query.
        """
        pass

    def to_json(self) -> str:
        """
        Convert the query to a JSON string.

        Returns
        -------
        str
            A JSON string representation of the query.

        Examples
        --------
        >>> from lancedb.query import MatchQuery
        >>> query = MatchQuery("puppy", "text", fuzziness=2)
        >>> query.to_json()
        '{"match":{"column":"text","terms":"puppy","boost":1.0,"fuzziness":2,"max_expansions":50,"operator":"Or","prefix_length":0}}'
        """
        return fts_query_to_json(self)

    def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
        """
        Combine two queries with a logical AND operation.

        Parameters
        ----------
        other : FullTextQuery
            The other query to combine with.

        Returns
        -------
        FullTextQuery
            A new query that combines both queries with AND.
        """
        # Both sides become MUST clauses: each must match.
        return BooleanQuery([(Occur.MUST, self), (Occur.MUST, other)])

    def __or__(self, other: "FullTextQuery") -> "FullTextQuery":
        """
        Combine two queries with a logical OR operation.

        Parameters
        ----------
        other : FullTextQuery
            The other query to combine with.

        Returns
        -------
        FullTextQuery
            A new query that combines both queries with OR.
        """
        # Both sides become SHOULD clauses: at least one should match.
        return BooleanQuery([(Occur.SHOULD, self), (Occur.SHOULD, other)])
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class MatchQuery(FullTextQuery):
    """
    Match query for full-text search.

    Parameters
    ----------
    query : str
        The query string to match against.
    column : str
        The name of the column to match against.
    boost : float, default 1.0
        The boost factor for the query.
        The score of each matching document is multiplied by this value.
    fuzziness : int, optional
        The maximum edit distance for each term in the match query.
        Defaults to 0 (exact match).
        If None, fuzziness is applied automatically by the rules:

        - 0 for terms with length <= 2
        - 1 for terms with length <= 5
        - 2 for terms with length > 5
    max_expansions : int, optional
        The maximum number of terms to consider for fuzzy matching.
        Defaults to 50.
    operator : FullTextOperator, default OR
        The operator to use for combining the query results.
        Can be either `AND` or `OR`.
        If `AND`, all terms in the query must match.
        If `OR`, at least one term in the query must match.
    prefix_length : int, optional
        The number of beginning characters being unchanged for fuzzy matching.
        This is useful to achieve prefix matching.
    """

    query: str
    column: str
    boost: float = pydantic.Field(1.0, kw_only=True)
    fuzziness: int = pydantic.Field(0, kw_only=True)
    max_expansions: int = pydantic.Field(50, kw_only=True)
    operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)
    prefix_length: int = pydantic.Field(0, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MATCH``."""
        return FullTextQueryType.MATCH
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class PhraseQuery(FullTextQuery):
    """
    Phrase query for full-text search.

    Parameters
    ----------
    query : str
        The query string to match against.
    column : str
        The name of the column to match against.
    slop : int, default 0
        Term-position flexibility for the phrase match; 0 requires the terms
        to appear exactly adjacent and in order. (Standard full-text "slop"
        semantics assumed — confirm against the FTS engine.)
    """

    query: str
    column: str
    slop: int = pydantic.Field(0, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MATCH_PHRASE``."""
        return FullTextQueryType.MATCH_PHRASE
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class BoostQuery(FullTextQuery):
    """
    Boost query for full-text search.

    Parameters
    ----------
    positive : FullTextQuery
        The positive query; matches increase the score.
    negative : FullTextQuery
        The negative query; matches are down-weighted by ``negative_boost``.
    negative_boost : float, default 0.5
        The boost factor for the negative query.
    """

    positive: FullTextQuery
    negative: FullTextQuery
    negative_boost: float = pydantic.Field(0.5, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.BOOST``."""
        return FullTextQueryType.BOOST
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class MultiMatchQuery(FullTextQuery):
    """
    Multi-match query for full-text search.

    Parameters
    ----------
    query : str
        The query string to match against each column.
    columns : list[str]
        The list of columns to match against.
    boosts : list[float], optional
        The list of boost factors for each column. If not provided,
        all columns will have the same boost factor.
    operator : FullTextOperator, default OR
        The operator to use for combining the query results.
        Can be either `AND` or `OR`.
        It would be applied to all columns individually.
        For example, if the operator is `AND`,
        then the query "hello world" is equal to
        `match("hello AND world", column1) OR match("hello AND world", column2)`.
    """

    query: str
    columns: list[str]
    boosts: Optional[list[float]] = pydantic.Field(None, kw_only=True)
    operator: FullTextOperator = pydantic.Field(FullTextOperator.OR, kw_only=True)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MULTI_MATCH``."""
        return FullTextQueryType.MULTI_MATCH
|
||
|
||
|
||
@pydantic.dataclasses.dataclass
class BooleanQuery(FullTextQuery):
    """
    Boolean query for full-text search.

    Parameters
    ----------
    queries : list[tuple(Occur, FullTextQuery)]
        The list of queries with their occurrence requirements.
        Each tuple contains an Occur value (MUST, SHOULD, or MUST_NOT)
        and a FullTextQuery to apply.
    """

    queries: list[tuple[Occur, FullTextQuery]]

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.BOOLEAN``."""
        return FullTextQueryType.BOOLEAN
|
||
|
||
|
||
class FullTextSearchQuery(pydantic.BaseModel):
    """A LanceDB Full Text Search Query

    Attributes
    ----------
    columns: Optional[List[str]]
        The columns to search

        If None, then the table should select the column automatically.
    query: str | FullTextQuery
        If a string, it is treated as a MatchQuery.
        If a FullTextQuery object, it is used directly.
    """

    # Columns to search; None lets the table choose automatically.
    columns: Optional[List[str]] = None
    # The query; a plain string is interpreted as a MatchQuery.
    query: Union[str, FullTextQuery]
|
||
|
||
|
||
class Query(pydantic.BaseModel):
    """A LanceDB Query

    Queries are constructed by the `Table.search` method. This class is a
    python representation of the query. Normally you will not need to interact
    with this class directly. You can build up a query and execute it using
    collection methods such as `to_batches()`, `to_arrow()`, `to_pandas()`,
    etc.

    However, you can use the `to_query_object()` method on a query builder to
    get the underlying query object.
    This can be useful for serializing a query or using it in a different context.

    Attributes
    ----------
    filter : Optional[str]
        sql filter to refine the query with
    limit : Optional[int]
        The limit on the number of results to return. If this is a vector or FTS query,
        then this is required. If this is a plain SQL query, then this is optional.
    offset: Optional[int]
        The offset to start fetching results from

        This is ignored for vector / FTS search (will be None).
    columns : Optional[Union[List[str], Dict[str, str]]]
        which columns to return in the results

        This can be a list of column names or a dictionary. If it is a dictionary,
        then the keys are the column names and the values are sql expressions to
        use to calculate the result.

        If this is None then all columns are returned. This can be expensive.
    with_row_id : Optional[bool]
        if True then include the row id in the results
    vector : Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]]
        the vector to search for, if this a vector search or hybrid search. It will
        be None for full text search and plain SQL filtering.
    vector_column : Optional[str]
        the name of the vector column to use for vector search

        If this is None then a default vector column will be used.
    distance_type : Optional[str]
        the distance type to use for vector search

        This can be l2 (default), cosine and dot. See [metric definitions][search] for
        more details.

        If this is not a vector search this will be None.
    postfilter : bool
        if True then apply the filter after vector / FTS search. This is ignored for
        plain SQL filtering.
    minimum_nprobes : Optional[int]
        The minimum number of IVF partitions to search. If this is None then a
        default number of partitions (20) will be used.

        - A higher number makes search more accurate but also slower.

        - See discussion in [Querying an ANN Index][querying-an-ann-index] for
          tuning advice.

        Will be None if this is not a vector search.
    maximum_nprobes : Optional[int]
        The maximum number of IVF partitions to search. If this is None then a
        default value (20) will be used. If 0 then no limit is applied and all
        partitions could be searched if needed to satisfy the limit.

        Will be None if this is not a vector search.
    refine_factor : Optional[int]
        Refine the results by reading extra elements and re-ranking them in memory.

        - A higher number makes search more accurate but also slower.

        - See discussion in [Querying an ANN Index][querying-an-ann-index] for
          tuning advice.

        Will be None if this is not a vector search.
    lower_bound : Optional[float]
        The lower bound for distance search

        Only results with a distance greater than or equal to this value
        will be returned.

        This will only be set on vector search.
    upper_bound : Optional[float]
        The upper bound for distance search

        Only results with a distance less than or equal to this value
        will be returned.

        This will only be set on vector search.
    ef : Optional[int]
        The size of the nearest neighbor list maintained during HNSW search

        This will only be set on vector search.
    full_text_query : Optional[Union[str, dict]]
        The full text search query

        This can be a string or a dictionary. A dictionary will be used to search
        multiple columns. The keys are the column names and the values are the
        search queries.

        This will only be set on FTS or hybrid queries.
    fast_search: Optional[bool]
        Skip a flat search of unindexed data. This will improve
        search performance but search results will not include unindexed data.

        The default is False
    """

    # The name of the vector column to use for vector search.
    vector_column: Optional[str] = None

    # vector to search for
    #
    # Note: today this will be floats on the sync path and pa.Array on the async
    # path though in the future we should unify this to pa.Array everywhere
    vector: Annotated[
        Optional[Union[List[float], List[List[float]], pa.Array, List[pa.Array]]],
        ensure_vector_query,
    ] = None

    # sql filter to refine the query with
    filter: Optional[str] = None

    # if True then apply the filter after vector search
    postfilter: Optional[bool] = None

    # full text search query
    full_text_query: Optional[FullTextSearchQuery] = None

    # top k results to return
    limit: Optional[int] = None

    # distance type to use for vector search
    distance_type: Optional[str] = None

    # which columns to return in the results
    columns: Optional[Union[List[str], Dict[str, str]]] = None

    # minimum number of IVF partitions to search
    #
    # If None then a default value (20) will be used.
    minimum_nprobes: Optional[int] = None

    # maximum number of IVF partitions to search
    #
    # If None then a default value (20) will be used.
    #
    # If 0 then no limit will be applied and all partitions could be searched
    # if needed to satisfy the limit.
    maximum_nprobes: Optional[int] = None

    # lower bound for distance search
    lower_bound: Optional[float] = None

    # upper bound for distance search
    upper_bound: Optional[float] = None

    # multiplier for the number of results to inspect for reranking
    refine_factor: Optional[int] = None

    # if true, include the row id in the results
    with_row_id: Optional[bool] = None

    # offset to start fetching results from
    offset: Optional[int] = None

    # if true, will only search the indexed data
    fast_search: Optional[bool] = None

    # size of the nearest neighbor list maintained during HNSW search
    ef: Optional[int] = None

    # Bypass the vector index and use a brute force search
    bypass_vector_index: Optional[bool] = None

    @classmethod
    def from_inner(cls, req: PyQueryRequest) -> Self:
        """Build a ``Query`` from the native ``PyQueryRequest`` object."""
        query = cls()
        query.limit = req.limit
        query.offset = req.offset
        query.filter = req.filter
        # NOTE(review): this assignment is effectively redundant — when
        # req.full_text_search is None it matches the field default, and
        # otherwise it is overwritten with a wrapped FullTextSearchQuery below.
        query.full_text_query = req.full_text_search
        query.columns = req.select
        query.with_row_id = req.with_row_id
        query.vector_column = req.column
        query.vector = req.query_vector
        query.distance_type = req.distance_type
        query.minimum_nprobes = req.minimum_nprobes
        query.maximum_nprobes = req.maximum_nprobes
        query.lower_bound = req.lower_bound
        query.upper_bound = req.upper_bound
        query.ef = req.ef
        query.refine_factor = req.refine_factor
        query.bypass_vector_index = req.bypass_vector_index
        query.postfilter = req.postfilter
        if req.full_text_search is not None:
            query.full_text_query = FullTextSearchQuery(
                columns=None,
                query=req.full_text_search,
            )
        return query

    # This tells pydantic to allow custom types (needed for the `vector` query since
    # pa.Array wouln't be allowed otherwise)
    if PYDANTIC_VERSION.major < 2:  # Pydantic 1.x compat
        class Config:
            arbitrary_types_allowed = True
    else:
        model_config = {"arbitrary_types_allowed": True}
|
||
|
||
|
||
class LanceQueryBuilder(ABC):
    """An abstract query builder. Subclasses are defined for vector search,
    full text search, hybrid, and plain SQL filtering.

    Instances are normally obtained via :meth:`LanceQueryBuilder.create`
    (called by ``Table.search``) and configured with chained calls such as
    ``where()``, ``select()`` and ``limit()`` before execution with
    ``to_arrow()`` / ``to_pandas()`` / etc.
    """
|
||
|
||
    @classmethod
    def create(
        cls,
        table: "Table",
        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
        query_type: str,
        vector_column_name: str,
        ordering_field_name: Optional[str] = None,
        fts_columns: Optional[Union[str, List[str]]] = None,
        fast_search: Optional[bool] = None,
    ) -> Self:
        """
        Create a query builder based on the given query and query type.

        Parameters
        ----------
        table: Table
            The table to query.
        query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]]
            The query to use. If None, an empty query builder is returned
            which performs simple SQL filtering.
        query_type: str
            The type of query to perform. One of "vector", "fts", "hybrid", or "auto".
            If "auto", the query type is inferred based on the query.
        vector_column_name: str
            The name of the vector column to use for vector search.
        ordering_field_name: Optional[str]
            Passed through to the FTS query builder; only used for FTS queries.
        fts_columns: Optional[Union[str, List[str]]]
            The column(s) to run full text search against (FTS and hybrid).
        fast_search: Optional[bool]
            Skip flat search of unindexed data.
        """
        # Check hybrid search first as it supports empty query pattern
        if query_type == "hybrid":
            # hybrid fts and vector query
            # NOTE(review): fast_search and ordering_field_name are not
            # forwarded on the hybrid path — confirm this is intentional.
            return LanceHybridQueryBuilder(
                table, query, vector_column_name, fts_columns=fts_columns
            )

        if query is None:
            return LanceEmptyQueryBuilder(table)

        # remember the string query for reranking purpose
        str_query = query if isinstance(query, str) else None

        # convert "auto" query_type to "vector", "fts"
        # or "hybrid" and convert the query to vector if needed
        query, query_type = cls._resolve_query(
            table, query, query_type, vector_column_name
        )

        if query_type == "hybrid":
            return LanceHybridQueryBuilder(
                table, query, vector_column_name, fts_columns=fts_columns
            )

        if isinstance(query, (str, FullTextQuery)):
            # fts
            return LanceFtsQueryBuilder(
                table,
                query,
                ordering_field_name=ordering_field_name,
                fts_columns=fts_columns,
                fast_search=fast_search,
            )

        # Anything else must be a vector; normalize to float32.
        if isinstance(query, list):
            query = np.array(query, dtype=np.float32)
        elif isinstance(query, np.ndarray):
            query = query.astype(np.float32)
        else:
            raise TypeError(f"Unsupported query type: {type(query)}")

        return LanceVectorQueryBuilder(
            table, query, vector_column_name, str_query, fast_search
        )
|
||
|
||
@classmethod
|
||
def _resolve_query(cls, table, query, query_type, vector_column_name):
|
||
# If query_type is fts, then query must be a string.
|
||
# otherwise raise TypeError
|
||
if query_type == "fts":
|
||
if not isinstance(query, (str, FullTextQuery)):
|
||
raise TypeError(
|
||
f"'fts' query must be a string or FullTextQuery: {type(query)}"
|
||
)
|
||
return query, query_type
|
||
elif query_type == "vector":
|
||
query = cls._query_to_vector(table, query, vector_column_name)
|
||
return query, query_type
|
||
elif query_type == "auto":
|
||
if isinstance(query, (list, np.ndarray)):
|
||
return query, "vector"
|
||
else:
|
||
conf = table.embedding_functions.get(vector_column_name)
|
||
if conf is not None:
|
||
query = conf.function.compute_query_embeddings_with_retry(query)[0]
|
||
return query, "vector"
|
||
else:
|
||
return query, "fts"
|
||
else:
|
||
raise ValueError(
|
||
f"Invalid query_type, must be 'vector', 'fts', or 'auto': {query_type}"
|
||
)
|
||
|
||
@classmethod
|
||
def _query_to_vector(cls, table, query, vector_column_name):
|
||
if isinstance(query, (list, np.ndarray)):
|
||
return query
|
||
conf = table.embedding_functions.get(vector_column_name)
|
||
if conf is not None:
|
||
return conf.function.compute_query_embeddings_with_retry(query)[0]
|
||
else:
|
||
msg = f"No embedding function for {vector_column_name}"
|
||
raise ValueError(msg)
|
||
|
||
    def __init__(self, table: "Table"):
        """Initialize the builder against *table* with all options unset."""
        # The table being queried.
        self._table = table
        # All options default to None, meaning "not set"; each is populated by
        # the corresponding chained builder method (limit(), where(), ...).
        self._limit = None
        self._offset = None
        self._columns = None
        self._where = None
        self._postfilter = None
        self._with_row_id = None
        self._vector = None
        self._text = None
        self._ef = None
        self._bypass_vector_index = None
|
||
|
||
    @deprecation.deprecated(
        deprecated_in="0.3.1",
        removed_in="0.4.0",
        current_version=__version__,
        details="Use to_pandas() instead",
    )
    def to_df(self) -> "pd.DataFrame":
        """
        *Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.*

        Execute the query and return the results as a pandas DataFrame.
        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vector.
        """
        return self.to_pandas()
|
||
|
||
def to_pandas(
|
||
self,
|
||
flatten: Optional[Union[int, bool]] = None,
|
||
*,
|
||
timeout: Optional[timedelta] = None,
|
||
) -> "pd.DataFrame":
|
||
"""
|
||
Execute the query and return the results as a pandas DataFrame.
|
||
In addition to the selected columns, LanceDB also returns a vector
|
||
and also the "_distance" column which is the distance between the query
|
||
vector and the returned vector.
|
||
|
||
Parameters
|
||
----------
|
||
flatten: Optional[Union[int, bool]]
|
||
If flatten is True, flatten all nested columns.
|
||
If flatten is an integer, flatten the nested columns up to the
|
||
specified depth.
|
||
If unspecified, do not flatten the nested columns.
|
||
timeout: Optional[timedelta]
|
||
The maximum time to wait for the query to complete.
|
||
If None, wait indefinitely.
|
||
"""
|
||
tbl = flatten_columns(self.to_arrow(timeout=timeout), flatten)
|
||
return tbl.to_pandas()
|
||
|
||
    @abstractmethod
    def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
        """
        Execute the query and return the results as an
        [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).

        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vectors.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.
        """
        # Abstract: each concrete builder implements its own execution path.
        raise NotImplementedError
|
||
|
||
    @abstractmethod
    def to_batches(
        self,
        /,
        batch_size: Optional[int] = None,
        *,
        timeout: Optional[timedelta] = None,
    ) -> pa.RecordBatchReader:
        """
        Execute the query and return the results as a pyarrow
        [RecordBatchReader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html)

        Parameters
        ----------
        batch_size: int
            The maximum number of selected records in a RecordBatch object.
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.
        """
        # Abstract: each concrete builder implements its own execution path.
        raise NotImplementedError
|
||
|
||
def to_list(self, *, timeout: Optional[timedelta] = None) -> List[dict]:
|
||
"""
|
||
Execute the query and return the results as a list of dictionaries.
|
||
|
||
Each list entry is a dictionary with the selected column names as keys,
|
||
or all table columns if `select` is not called. The vector and the "_distance"
|
||
fields are returned whether or not they're explicitly selected.
|
||
|
||
Parameters
|
||
----------
|
||
timeout: Optional[timedelta]
|
||
The maximum time to wait for the query to complete.
|
||
If None, wait indefinitely.
|
||
"""
|
||
return self.to_arrow(timeout=timeout).to_pylist()
|
||
|
||
def to_pydantic(
|
||
self, model: type[T], *, timeout: Optional[timedelta] = None
|
||
) -> list[T]:
|
||
"""Return the table as a list of pydantic models.
|
||
|
||
Parameters
|
||
----------
|
||
model: Type[LanceModel]
|
||
The pydantic model to use.
|
||
timeout: Optional[timedelta]
|
||
The maximum time to wait for the query to complete.
|
||
If None, wait indefinitely.
|
||
|
||
Returns
|
||
-------
|
||
List[LanceModel]
|
||
"""
|
||
return [model(**row) for row in self.to_arrow(timeout=timeout).to_pylist()]
|
||
|
||
def to_polars(self, *, timeout: Optional[timedelta] = None) -> "pl.DataFrame":
|
||
"""
|
||
Execute the query and return the results as a Polars DataFrame.
|
||
In addition to the selected columns, LanceDB also returns a vector
|
||
and also the "_distance" column which is the distance between the query
|
||
vector and the returned vector.
|
||
|
||
Parameters
|
||
----------
|
||
timeout: Optional[timedelta]
|
||
The maximum time to wait for the query to complete.
|
||
If None, wait indefinitely.
|
||
"""
|
||
import polars as pl
|
||
|
||
return pl.from_arrow(self.to_arrow(timeout=timeout))
|
||
|
||
def limit(self, limit: Union[int, None]) -> Self:
|
||
"""Set the maximum number of results to return.
|
||
|
||
Parameters
|
||
----------
|
||
limit: int
|
||
The maximum number of results to return.
|
||
The default query limit is 10 results.
|
||
For ANN/KNN queries, you must specify a limit.
|
||
For plain searches, all records are returned if limit not set.
|
||
*WARNING* if you have a large dataset, setting
|
||
the limit to a large number, e.g. the table size,
|
||
can potentially result in reading a
|
||
large amount of data into memory and cause
|
||
out of memory issues.
|
||
|
||
Returns
|
||
-------
|
||
LanceQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
if limit is None or limit <= 0:
|
||
if isinstance(self, LanceVectorQueryBuilder):
|
||
raise ValueError("Limit is required for ANN/KNN queries")
|
||
else:
|
||
self._limit = None
|
||
else:
|
||
self._limit = limit
|
||
return self
|
||
|
||
def offset(self, offset: int) -> Self:
|
||
"""Set the offset for the results.
|
||
|
||
Parameters
|
||
----------
|
||
offset: int
|
||
The offset to start fetching results from.
|
||
|
||
Returns
|
||
-------
|
||
LanceQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
if offset is None or offset <= 0:
|
||
self._offset = 0
|
||
else:
|
||
self._offset = offset
|
||
return self
|
||
|
||
def select(self, columns: Union[list[str], dict[str, str]]) -> Self:
|
||
"""Set the columns to return.
|
||
|
||
Parameters
|
||
----------
|
||
columns: list of str, or dict of str to str default None
|
||
List of column names to be fetched.
|
||
Or a dictionary of column names to SQL expressions.
|
||
All columns are fetched if None or unspecified.
|
||
|
||
Returns
|
||
-------
|
||
LanceQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
if isinstance(columns, list) or isinstance(columns, dict):
|
||
self._columns = columns
|
||
else:
|
||
raise ValueError("columns must be a list or a dictionary")
|
||
return self
|
||
|
||
    def where(self, where: str, prefilter: bool = True) -> Self:
        """Set the where clause.

        Parameters
        ----------
        where: str
            The where clause which is a valid SQL where clause. See
            `Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
            for valid SQL expressions.
        prefilter: bool, default True
            If True, apply the filter before vector search, otherwise the
            filter is applied on the result of vector search.
            This feature is **EXPERIMENTAL** and may be removed and modified
            without warning in the future.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._where = where
        # Stored internally with inverted polarity: prefilter=True means
        # postfilter=False.
        # NOTE(review): a user report states that on the *hybrid* search path
        # the prefilter argument is silently inverted (prefilter=True behaves
        # as post-filtering and vice versa). The inversion is not visible in
        # this base class — verify how LanceHybridQueryBuilder consumes
        # self._postfilter before changing anything here.
        self._postfilter = not prefilter
        return self
|
||
|
||
    def with_row_id(self, with_row_id: bool) -> Self:
        """Set whether to return row ids.

        Parameters
        ----------
        with_row_id: bool
            If True, return _rowid column in the results.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._with_row_id = with_row_id
        return self
|
||
|
||
    def explain_plan(self, verbose: Optional[bool] = False) -> str:
        """Return the execution plan for this query.

        Examples
        --------
        >>> import lancedb
        >>> db = lancedb.connect("./.lancedb")
        >>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
        >>> query = [100, 100]
        >>> plan = table.search(query).explain_plan(True)
        >>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
          GlobalLimitExec: skip=0, fetch=10
            FilterExec: _distance@2 IS NOT NULL
              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                KNNVectorDistance: metric=l2
                  LanceRead: uri=..., projection=[vector], ...

        Parameters
        ----------
        verbose : bool, default False
            Use a verbose output format.

        Returns
        -------
        plan : str
        """  # noqa: E501
        # Delegates to the table with the serialized query representation.
        return self._table._explain_plan(self.to_query_object(), verbose=verbose)
|
||
|
||
    def analyze_plan(self) -> str:
        """
        Run the query and return its execution plan with runtime metrics.

        This returns detailed metrics for each step, such as elapsed time,
        rows processed, bytes read, and I/O stats. It is useful for debugging
        and performance tuning.

        Examples
        --------
        >>> import lancedb
        >>> db = lancedb.connect("./.lancedb")
        >>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
        >>> query = [100, 100]
        >>> plan = table.search(query).analyze_plan()
        >>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        AnalyzeExec verbose=true, elapsed=..., metrics=...
          TracedExec, elapsed=..., metrics=...
            ProjectionExec: elapsed=..., expr=[...],
                metrics=[output_rows=..., elapsed_compute=..., output_bytes=...]
              GlobalLimitExec: elapsed=..., skip=0, fetch=10,
                  metrics=[output_rows=..., elapsed_compute=..., output_bytes=...]
                FilterExec: elapsed=..., _distance@2 IS NOT NULL, metrics=[...]
                  SortExec: elapsed=..., TopK(fetch=10), expr=[...],
                      preserve_partitioning=[...],
                      metrics=[output_rows=..., elapsed_compute=...,
                      output_bytes=..., row_replacements=...]
                    KNNVectorDistance: elapsed=..., metric=l2,
                        metrics=[output_rows=..., elapsed_compute=...,
                        output_bytes=..., output_batches=...]
                      LanceRead: elapsed=..., uri=..., projection=[vector],
                          num_fragments=..., range_before=None, range_after=None,
                          row_id=true, row_addr=false,
                          full_filter=--, refine_filter=--,
                          metrics=[output_rows=..., elapsed_compute=..., output_bytes=...,
                          fragments_scanned=..., ranges_scanned=1, rows_scanned=1,
                          bytes_read=..., iops=..., requests=..., task_wait_time=...]

        Returns
        -------
        plan : str
            The physical query execution plan with runtime metrics.
        """
        # Delegates to the table with the serialized query representation.
        return self._table._analyze_plan(self.to_query_object())
|
||
|
||
def vector(self, vector: Union[np.ndarray, list]) -> Self:
|
||
"""Set the vector to search for.
|
||
|
||
Parameters
|
||
----------
|
||
vector: np.ndarray or list
|
||
The vector to search for.
|
||
|
||
Returns
|
||
-------
|
||
LanceQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
raise NotImplementedError
|
||
|
||
def text(self, text: str | FullTextQuery) -> Self:
|
||
"""Set the text to search for.
|
||
|
||
Parameters
|
||
----------
|
||
text: str | FullTextQuery
|
||
If a string, it is treated as a MatchQuery.
|
||
If a FullTextQuery object, it is used directly.
|
||
|
||
Returns
|
||
-------
|
||
LanceQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
raise NotImplementedError
|
||
|
||
@abstractmethod
|
||
def rerank(self, reranker: Reranker) -> Self:
|
||
"""Rerank the results using the specified reranker.
|
||
|
||
Parameters
|
||
----------
|
||
reranker: Reranker
|
||
The reranker to use.
|
||
|
||
Returns
|
||
-------
|
||
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
raise NotImplementedError
|
||
|
||
@abstractmethod
|
||
def to_query_object(self) -> Query:
|
||
"""Return a serializable representation of the query
|
||
|
||
Returns
|
||
-------
|
||
Query
|
||
The serializable representation of the query
|
||
"""
|
||
raise NotImplementedError
|
||
|
||
|
||
class LanceVectorQueryBuilder(LanceQueryBuilder):
    """
    A query builder for vector (nearest-neighbor) search.

    Examples
    --------
    >>> import lancedb
    >>> data = [{"vector": [1.1, 1.2], "b": 2},
    ...         {"vector": [0.5, 1.3], "b": 4},
    ...         {"vector": [0.4, 0.4], "b": 6},
    ...         {"vector": [0.4, 0.4], "b": 10}]
    >>> db = lancedb.connect("./.lancedb")
    >>> table = db.create_table("my_table", data=data)
    >>> (table.search([0.4, 0.4])
    ...        .distance_type("cosine")
    ...        .where("b < 10")
    ...        .select(["b", "vector"])
    ...        .limit(2)
    ...        .to_pandas())
       b      vector  _distance
    0  6  [0.4, 0.4]   0.000000
    1  2  [1.1, 1.2]   0.000944
    """

    def __init__(
        self,
        table: "Table",
        query: Union[np.ndarray, list, "PIL.Image.Image"],
        vector_column: str,
        str_query: Optional[str] = None,
        fast_search: bool = None,
    ):
        """Create a vector query builder.

        Parameters
        ----------
        table: Table
            The table to query.
        query: np.ndarray or list or PIL.Image.Image
            The query vector. NOTE(review): by the time `to_query_object` runs
            this must be a list/ndarray (it calls `.tolist()`); an image query
            is presumably embedded by the caller first — confirm upstream.
        vector_column: str
            The name of the vector column to search.
        str_query: Optional[str]
            The original string form of the query, if any. Used as the
            reranker query when `rerank()` is called.
        fast_search: bool
            If True, skip searching unindexed data.
        """
        super().__init__(table)
        self._query = query
        self._distance_type = None
        self._minimum_nprobes = None
        self._maximum_nprobes = None
        self._lower_bound = None
        self._upper_bound = None
        self._refine_factor = None
        self._vector_column = vector_column
        self._postfilter = None
        self._reranker = None
        self._str_query = str_query
        self._fast_search = fast_search

    def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
        """Set the distance metric to use.

        This is an alias for distance_type() and may be deprecated in the future.
        Please use distance_type() instead.

        Parameters
        ----------
        metric: "l2" or "cosine" or "dot"
            The distance metric to use. By default "l2" is used.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        return self.distance_type(metric)

    def distance_type(
        self, distance_type: Literal["l2", "cosine", "dot"]
    ) -> "LanceVectorQueryBuilder":
        """Set the distance metric to use.

        When performing a vector search we try and find the "nearest" vectors according
        to some kind of distance metric. This parameter controls which distance metric
        to use.

        Note: if there is a vector index then the distance type used MUST match the
        distance type used to train the vector index. If this is not done then the
        results will be invalid.

        Parameters
        ----------
        distance_type: "l2" or "cosine" or "dot"
            The distance metric to use. By default "l2" is used.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        # Normalize case so "L2"/"Cosine" etc. are accepted.
        self._distance_type = distance_type.lower()
        return self

    def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
        """Set the number of probes to use.

        Higher values will yield better recall (more likely to find vectors if
        they exist) at the expense of latency.

        See discussion in [Querying an ANN Index][querying-an-ann-index] for
        tuning advice.

        This method sets both the minimum and maximum number of probes to the same
        value. See `minimum_nprobes` and `maximum_nprobes` for more fine-grained
        control.

        Parameters
        ----------
        nprobes: int
            The number of probes to use.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._minimum_nprobes = nprobes
        self._maximum_nprobes = nprobes
        return self

    def minimum_nprobes(self, minimum_nprobes: int) -> LanceVectorQueryBuilder:
        """Set the minimum number of probes to use.

        See `nprobes` for more details.

        These partitions will be searched on every vector query and will increase recall
        at the expense of latency.
        """
        self._minimum_nprobes = minimum_nprobes
        return self

    def maximum_nprobes(self, maximum_nprobes: int) -> LanceVectorQueryBuilder:
        """Set the maximum number of probes to use.

        See `nprobes` for more details.

        If this value is greater than `minimum_nprobes` then the excess partitions
        will be searched only if we have not found enough results.

        This can be useful when there is a narrow filter to allow these queries to
        spend more time searching and avoid potential false negatives.

        If this value is 0 then no limit will be applied and all partitions could be
        searched if needed to satisfy the limit.
        """
        self._maximum_nprobes = maximum_nprobes
        return self

    def distance_range(
        self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
    ) -> LanceVectorQueryBuilder:
        """Set the distance range to use.

        Only rows with distances within range [lower_bound, upper_bound)
        will be returned.

        Parameters
        ----------
        lower_bound: Optional[float]
            The lower bound of the distance range.
        upper_bound: Optional[float]
            The upper bound of the distance range.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._lower_bound = lower_bound
        self._upper_bound = upper_bound
        return self

    def ef(self, ef: int) -> LanceVectorQueryBuilder:
        """Set the number of candidates to consider during search.

        Higher values will yield better recall (more likely to find vectors if
        they exist) at the expense of latency.

        This only applies to the HNSW-related index.
        The default value is 1.5 * limit.

        Parameters
        ----------
        ef: int
            The number of candidates to consider during search.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._ef = ef
        return self

    def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
        """Set the refine factor to use, increasing the number of vectors sampled.

        As an example, a refine factor of 2 will sample 2x as many vectors as
        requested, re-ranks them, and returns the top half most relevant results.

        See discussion in [Querying an ANN Index][querying-an-ann-index] for
        tuning advice.

        Parameters
        ----------
        refine_factor: int
            The refine factor to use.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._refine_factor = refine_factor
        return self

    def output_schema(self) -> pa.Schema:
        """
        Return the output schema for the query

        This does not execute the query.
        """
        return self._table._output_schema(self.to_query_object())

    def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
        """
        Execute the query and return the results as an
        [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).

        In addition to the selected columns, LanceDB also returns a vector
        and also the "_distance" column which is the distance between the query
        vector and the returned vectors.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.
        """
        return self.to_batches(timeout=timeout).read_all()

    def to_query_object(self) -> Query:
        """
        Build a Query object

        This can be used to serialize a query

        Returns
        -------
        Query
            The serializable representation of this vector query.
        """
        # The query must be list-like by now; ndarrays are converted, and a
        # list of ndarrays (multi-vector query) is converted element-wise.
        vector = self._query if isinstance(self._query, list) else self._query.tolist()
        if isinstance(vector[0], np.ndarray):
            vector = [v.tolist() for v in vector]
        return Query(
            vector=vector,
            filter=self._where,
            postfilter=self._postfilter,
            limit=self._limit,
            distance_type=self._distance_type,
            columns=self._columns,
            minimum_nprobes=self._minimum_nprobes,
            maximum_nprobes=self._maximum_nprobes,
            lower_bound=self._lower_bound,
            upper_bound=self._upper_bound,
            refine_factor=self._refine_factor,
            vector_column=self._vector_column,
            with_row_id=self._with_row_id,
            offset=self._offset,
            fast_search=self._fast_search,
            ef=self._ef,
            bypass_vector_index=self._bypass_vector_index,
        )

    def to_batches(
        self,
        /,
        batch_size: Optional[int] = None,
        *,
        timeout: Optional[timedelta] = None,
    ) -> pa.RecordBatchReader:
        """
        Execute the query and return the result as a RecordBatchReader object.

        Parameters
        ----------
        batch_size: int
            The maximum number of selected records in a RecordBatch object.
        timeout: timedelta, default None
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.

        Returns
        -------
        pa.RecordBatchReader
        """
        # NOTE: a previous version recomputed the query vector here; that work
        # was dead code because to_query_object() performs the conversion.
        query = self.to_query_object()
        result_set = self._table._execute_query(
            query, batch_size=batch_size, timeout=timeout
        )
        if self._reranker is not None:
            # Reranking needs the full result set, so materialize it, rerank,
            # then wrap the reranked table back into a RecordBatchReader.
            rs_table = result_set.read_all()
            result_set = self._reranker.rerank_vector(self._str_query, rs_table)
            check_reranker_result(result_set)
            result_set = pa.RecordBatchReader.from_batches(
                result_set.schema, result_set.to_batches()
            )

        return result_set

    def where(self, where: str, prefilter: bool = None) -> LanceVectorQueryBuilder:
        """Set the where clause.

        Parameters
        ----------
        where: str
            The where clause which is a valid SQL where clause. See
            `Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
            for valid SQL expressions.
        prefilter: bool, default None
            If True, apply the filter before vector search, otherwise the
            filter is applied on the result of vector search.
            If None (the default) the current pre/post-filter setting is left
            unchanged and the engine's default behavior applies.

        Returns
        -------
        LanceQueryBuilder
            The LanceQueryBuilder object.
        """
        self._where = where
        if prefilter is not None:
            # The serialized query expresses this as "postfilter", which is
            # the inverse of the user-facing "prefilter" flag.
            self._postfilter = not prefilter
        return self

    def rerank(
        self, reranker: Reranker, query_string: Optional[str] = None
    ) -> LanceVectorQueryBuilder:
        """Rerank the results using the specified reranker.

        Parameters
        ----------
        reranker: Reranker
            The reranker to use.

        query_string: Optional[str]
            The query to use for reranking. This needs to be specified explicitly here
            as the query used for vector search may already be vectorized and the
            reranker requires a string query.
            This is only required if the query used for vector search is not a string.
            Note: This doesn't yet support the case where the query is multimodal or a
            list of vectors.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceQueryBuilder object.
        """
        self._reranker = reranker
        if self._str_query is None and query_string is None:
            raise ValueError(
                """
                The query used for vector search is not a string.
                In this case, the reranker query needs to be specified explicitly.
                """
            )
        if query_string is not None and not isinstance(query_string, str):
            raise ValueError("Reranking currently only supports string queries")
        self._str_query = query_string if query_string is not None else self._str_query
        if reranker.score == "all":
            # "all"-score rerankers need row ids to join scores back together.
            self.with_row_id(True)
        return self

    def bypass_vector_index(self) -> LanceVectorQueryBuilder:
        """
        If this is called then any vector index is skipped

        An exhaustive (flat) search will be performed. The query vector will
        be compared to every vector in the table. At high scales this can be
        expensive. However, this is often still useful. For example, skipping
        the vector index can give you ground truth results which you can use to
        calculate your recall to select an appropriate value for nprobes.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceVectorQueryBuilder object.
        """
        self._bypass_vector_index = True
        return self

    def fast_search(self) -> LanceVectorQueryBuilder:
        """
        Skip a flat search of unindexed data. This will improve
        search performance but search results will not include unindexed data.

        Returns
        -------
        LanceVectorQueryBuilder
            The LanceVectorQueryBuilder object.
        """
        self._fast_search = True
        return self
|
||
|
||
|
||
class LanceFtsQueryBuilder(LanceQueryBuilder):
    """A builder for full text search for LanceDB."""

    def __init__(
        self,
        table: "Table",
        query: str | FullTextQuery,
        ordering_field_name: Optional[str] = None,
        fts_columns: Optional[Union[str, List[str]]] = None,
        fast_search: bool = None,
    ):
        """Create a full-text-search query builder.

        Parameters
        ----------
        table: Table
            The table to query.
        query: str | FullTextQuery
            The text query; a string is treated as a match query.
        ordering_field_name: Optional[str]
            Field used to order results in the tantivy code path.
        fts_columns: Optional[Union[str, List[str]]]
            Column(s) to search; a single string is normalized to a list.
        fast_search: bool
            If True, skip searching unindexed data.
        """
        super().__init__(table)
        self._query = query
        self._phrase_query = False
        self.ordering_field_name = ordering_field_name
        self._reranker = None
        self._fast_search = fast_search
        # Normalize a single column name into the list form used internally.
        if isinstance(fts_columns, str):
            fts_columns = [fts_columns]
        self._fts_columns = fts_columns

    def phrase_query(self, phrase_query: bool = True) -> LanceFtsQueryBuilder:
        """Set whether to use phrase query.

        Parameters
        ----------
        phrase_query: bool, default True
            If True, then the query will be wrapped in quotes and
            double quotes replaced by single quotes.

        Returns
        -------
        LanceFtsQueryBuilder
            The LanceFtsQueryBuilder object.
        """
        self._phrase_query = phrase_query
        return self

    def fast_search(self) -> LanceFtsQueryBuilder:
        """
        Skip a flat search of unindexed data. This will improve
        search performance but search results will not include unindexed data.

        Returns
        -------
        LanceFtsQueryBuilder
            The LanceFtsQueryBuilder object.
        """
        self._fast_search = True
        return self

    def to_query_object(self) -> Query:
        """Build the serializable Query for this full-text search."""
        return Query(
            columns=self._columns,
            filter=self._where,
            limit=self._limit,
            postfilter=self._postfilter,
            with_row_id=self._with_row_id,
            full_text_query=FullTextSearchQuery(
                query=self._query, columns=self._fts_columns
            ),
            offset=self._offset,
            fast_search=self._fast_search,
        )

    def output_schema(self) -> pa.Schema:
        """
        Return the output schema for the query

        This does not execute the query.
        """
        return self._table._output_schema(self.to_query_object())

    def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
        """Execute the full-text search and return the results as a pa.Table.

        If a legacy (tantivy) index directory exists, delegate to
        `tantivy_to_arrow`; otherwise run the native FTS path.
        """
        path, fs, exist = self._table._get_fts_index_path()
        if exist:
            # Legacy tantivy-based index takes precedence when present.
            return self.tantivy_to_arrow()

        query = self._query
        if self._phrase_query:
            if isinstance(query, str):
                # Wrap in quotes so the engine parses it as a phrase, unless
                # the caller already quoted it.
                if not query.startswith('"') or not query.endswith('"'):
                    self._query = f'"{query}"'
            elif isinstance(query, FullTextQuery) and not isinstance(
                query, PhraseQuery
            ):
                raise TypeError("Please use PhraseQuery for phrase queries.")
        query = self.to_query_object()
        results = self._table._execute_query(query, timeout=timeout)
        results = results.read_all()
        if self._reranker is not None:
            results = self._reranker.rerank_fts(self._query, results)
            check_reranker_result(results)
        return results

    def to_batches(
        self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
    ):
        # Streaming is not implemented for FTS results.
        raise NotImplementedError("to_batches on an FTS query")

    def tantivy_to_arrow(self) -> pa.Table:
        """Execute the search against a legacy tantivy index.

        Searches the on-disk tantivy index, takes the matching rows from the
        underlying lance dataset, applies any `where` filter after the fact,
        and optionally attaches row ids and reranks.
        """
        try:
            import tantivy
        except ImportError:
            raise ImportError(
                "Please install tantivy-py `pip install tantivy` to use the full text search feature."  # noqa: E501
            )

        from .fts import search_index

        # get the index path
        path, fs, exist = self._table._get_fts_index_path()

        # check if the index exist
        if not exist:
            raise FileNotFoundError(
                "Fts index does not exist. "
                "Please first call table.create_fts_index(['<field_names>']) to "
                "create the fts index."
            )

        # Check that we are on local filesystem
        if not isinstance(fs, pa_fs.LocalFileSystem):
            raise NotImplementedError(
                "Tantivy-based full text search "
                "is only supported on the local filesystem"
            )
        # open the index
        index = tantivy.Index.open(path)
        # get the scores and doc ids
        query = self._query
        if self._phrase_query:
            # Tantivy phrase syntax: single-quote any embedded double quotes,
            # then wrap the whole query in double quotes.
            query = query.replace('"', "'")
            query = f'"{query}"'
        limit = self._limit if self._limit is not None else 10
        row_ids, scores = search_index(
            index, query, limit, ordering_field=self.ordering_field_name
        )
        if len(row_ids) == 0:
            empty_schema = pa.schema([pa.field("_score", pa.float32())])
            return pa.Table.from_batches([], schema=empty_schema)
        scores = pa.array(scores)
        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
        output_tbl = output_tbl.append_column("_score", scores)
        # this needs to match vector search results which are uint64
        row_ids = pa.array(row_ids, type=pa.uint64())

        if self._where is not None:
            # Post-filter: add a temporary positional index column so rows
            # surviving the SQL filter can be mapped back to their row ids.
            tmp_name = "__lancedb__duckdb__indexer__"
            output_tbl = output_tbl.append_column(
                tmp_name, pa.array(range(len(output_tbl)))
            )
            try:
                # TODO would be great to have Substrait generate pyarrow compute
                # expressions or conversely have pyarrow support SQL expressions
                # using Substrait
                import duckdb

                indexer = duckdb.sql(
                    f"SELECT {tmp_name} FROM output_tbl WHERE {self._where}"
                ).to_arrow_table()[tmp_name]
                output_tbl = output_tbl.take(indexer).drop([tmp_name])
                row_ids = row_ids.take(indexer)

            except ImportError:
                # Fallback when duckdb is unavailable: round-trip through a
                # temporary lance dataset to evaluate the SQL filter.
                import tempfile

                import lance

                # TODO Use "memory://" instead once that's supported
                with tempfile.TemporaryDirectory() as tmp:
                    ds = lance.write_dataset(output_tbl, tmp)
                    output_tbl = ds.to_table(filter=self._where)
                    indexer = output_tbl[tmp_name]
                    row_ids = row_ids.take(indexer)
                    output_tbl = output_tbl.drop([tmp_name])

        if self._with_row_id:
            output_tbl = output_tbl.append_column("_rowid", row_ids)

        if self._reranker is not None:
            output_tbl = self._reranker.rerank_fts(self._query, output_tbl)
        return output_tbl

    def rerank(self, reranker: Reranker) -> LanceFtsQueryBuilder:
        """Rerank the results using the specified reranker.

        Parameters
        ----------
        reranker: Reranker
            The reranker to use.

        Returns
        -------
        LanceFtsQueryBuilder
            The LanceQueryBuilder object.
        """
        self._reranker = reranker
        if reranker.score == "all":
            # "all"-score rerankers need row ids to join scores together.
            self.with_row_id(True)
        return self
|
||
|
||
|
||
class LanceEmptyQueryBuilder(LanceQueryBuilder):
    """A query builder for plain scans with neither a vector nor a text query."""

    def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
        """Execute the scan and collect every batch into one pa.Table."""
        reader = self.to_batches(timeout=timeout)
        return reader.read_all()

    def to_query_object(self) -> Query:
        """Build the serializable Query from the accumulated builder state."""
        return Query(
            columns=self._columns,
            filter=self._where,
            limit=self._limit,
            with_row_id=self._with_row_id,
            offset=self._offset,
        )

    def output_schema(self) -> pa.Schema:
        """Return the result schema without executing the query."""
        return self._table._output_schema(self.to_query_object())

    def to_batches(
        self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
    ) -> pa.RecordBatchReader:
        """Execute the scan and stream results as a RecordBatchReader.

        Parameters
        ----------
        batch_size: int
            The maximum number of selected records in a RecordBatch object.
        timeout: timedelta, default None
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.
        """
        request = self.to_query_object()
        return self._table._execute_query(
            request, batch_size=batch_size, timeout=timeout
        )

    def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
        """Rerank the results using the specified reranker.

        Parameters
        ----------
        reranker: Reranker
            The reranker to use.

        Returns
        -------
        LanceEmptyQueryBuilder
            The LanceQueryBuilder object.

        Raises
        ------
        NotImplementedError
            Always; reranking a plain scan is not supported.
        """
        raise NotImplementedError("Reranking is not yet supported.")
|
||
|
||
|
||
class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||
"""
|
||
A query builder that performs hybrid vector and full text search.
|
||
Results are combined and reranked based on the specified reranker.
|
||
By default, the results are reranked using the RRFReranker, which
|
||
uses reciprocal rank fusion score for reranking.
|
||
|
||
To make the vector and fts results comparable, the scores are normalized.
|
||
Instead of normalizing scores, the `normalize` parameter can be set to "rank"
|
||
in the `rerank` method to convert the scores to ranks and then normalize them.
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
table: "Table",
|
||
query: Optional[Union[str, FullTextQuery]] = None,
|
||
vector_column: Optional[str] = None,
|
||
fts_columns: Optional[Union[str, List[str]]] = None,
|
||
):
|
||
super().__init__(table)
|
||
self._query = query
|
||
self._vector_column = vector_column
|
||
self._fts_columns = fts_columns
|
||
self._norm = None
|
||
self._reranker = None
|
||
self._minimum_nprobes = None
|
||
self._maximum_nprobes = None
|
||
self._refine_factor = None
|
||
self._distance_type = None
|
||
self._phrase_query = None
|
||
self._lower_bound = None
|
||
self._upper_bound = None
|
||
|
||
def _validate_query(self, query, vector=None, text=None):
|
||
if query is not None and (vector is not None or text is not None):
|
||
raise ValueError(
|
||
"You can either provide a string query in search() method"
|
||
"or set `vector()` and `text()` explicitly for hybrid search."
|
||
"But not both."
|
||
)
|
||
|
||
vector_query = vector if vector is not None else query
|
||
if not isinstance(vector_query, (str, list, np.ndarray)):
|
||
raise ValueError("Vector query must be either a string or a vector")
|
||
|
||
text_query = text or query
|
||
if text_query is None:
|
||
raise ValueError("Text query must be provided for hybrid search.")
|
||
if not isinstance(text_query, (str, FullTextQuery)):
|
||
raise ValueError("Text query must be a string or FullTextQuery")
|
||
|
||
return vector_query, text_query
|
||
|
||
def phrase_query(self, phrase_query: bool = None) -> LanceHybridQueryBuilder:
|
||
"""Set whether to use phrase query.
|
||
|
||
Parameters
|
||
----------
|
||
phrase_query: bool, default True
|
||
If True, then the query will be wrapped in quotes and
|
||
double quotes replaced by single quotes.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._phrase_query = phrase_query
|
||
return self
|
||
|
||
def to_query_object(self) -> Query:
|
||
raise NotImplementedError("to_query_object not yet supported on a hybrid query")
|
||
|
||
def to_arrow(self, *, timeout: Optional[timedelta] = None) -> pa.Table:
|
||
self._create_query_builders()
|
||
with ThreadPoolExecutor() as executor:
|
||
fts_future = executor.submit(
|
||
self._fts_query.with_row_id(True).to_arrow, timeout=timeout
|
||
)
|
||
vector_future = executor.submit(
|
||
self._vector_query.with_row_id(True).to_arrow, timeout=timeout
|
||
)
|
||
fts_results = fts_future.result()
|
||
vector_results = vector_future.result()
|
||
|
||
return self._combine_hybrid_results(
|
||
fts_results=fts_results,
|
||
vector_results=vector_results,
|
||
norm=self._norm,
|
||
fts_query=self._fts_query._query,
|
||
reranker=self._reranker,
|
||
limit=self._limit,
|
||
with_row_ids=self._with_row_id,
|
||
)
|
||
|
||
    @staticmethod
    def _combine_hybrid_results(
        fts_results: pa.Table,
        vector_results: pa.Table,
        norm: str,
        fts_query: str,
        reranker,
        limit: int,
        with_row_ids: bool,
    ) -> pa.Table:
        """Merge, normalize, rerank, and trim the two hybrid result sets.

        Scores are normalized (or converted to ranks when ``norm == "rank"``)
        so the reranker sees comparable values; the original `_distance` and
        `_score` values are restored on the reranked output afterwards.
        """
        if norm == "rank":
            # Replace raw scores with 1-based ranks before normalization.
            vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
            fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")

        # If both result sets are empty (e.g. after hard filtering),
        # return early to avoid errors in reranking or score restoration.
        if vector_results.num_rows == 0 and fts_results.num_rows == 0:
            # Build a minimal empty table with the _relevance_score column
            combined_schema = pa.unify_schemas(
                [vector_results.schema, fts_results.schema],
            )
            empty = pa.table(
                {
                    col: pa.array([], type=combined_schema.field(col).type)
                    for col in combined_schema.names
                }
            )
            empty = empty.append_column(
                "_relevance_score", pa.array([], type=pa.float32())
            )
            if not with_row_ids and "_rowid" in empty.column_names:
                empty = empty.drop(["_rowid"])
            return empty

        original_distances = None
        original_scores = None
        original_distance_row_ids = None
        original_score_row_ids = None
        # normalize the scores to be between 0 and 1, 0 being most relevant
        # We check whether the results (vector and FTS) are empty, because when
        # they are, they often are missing the _rowid column, which causes an error
        if vector_results.num_rows > 0:
            distance_i = vector_results.column_names.index("_distance")
            # Keep the raw distances (and their row ids) to restore later.
            original_distances = vector_results.column(distance_i)
            original_distance_row_ids = vector_results.column("_rowid")
            vector_results = vector_results.set_column(
                distance_i,
                vector_results.field(distance_i),
                LanceHybridQueryBuilder._normalize_scores(original_distances),
            )

        # In fts higher scores represent relevance. Not inverting them here as
        # rerankers might need to preserve this score to support `return_score="all"`
        if fts_results.num_rows > 0:
            score_i = fts_results.column_names.index("_score")
            original_scores = fts_results.column(score_i)
            original_score_row_ids = fts_results.column("_rowid")
            fts_results = fts_results.set_column(
                score_i,
                fts_results.field(score_i),
                LanceHybridQueryBuilder._normalize_scores(original_scores),
            )

        results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)

        check_reranker_result(results)

        if "_distance" in results.column_names and original_distances is not None:
            # restore the original distances
            indices = pc.index_in(
                results["_rowid"], original_distance_row_ids, skip_nulls=True
            )
            original_distances = pc.take(original_distances, indices)
            distance_i = results.column_names.index("_distance")
            results = results.set_column(distance_i, "_distance", original_distances)

        if "_score" in results.column_names and original_scores is not None:
            # restore the original scores
            indices = pc.index_in(
                results["_rowid"], original_score_row_ids, skip_nulls=True
            )
            original_scores = pc.take(original_scores, indices)
            score_i = results.column_names.index("_score")
            results = results.set_column(score_i, "_score", original_scores)

        # Trim to the requested limit; reranker output is already ordered.
        results = results.slice(length=limit)

        if not with_row_ids:
            results = results.drop(["_rowid"])

        return results
|
||
|
||
def to_batches(
|
||
self, /, batch_size: Optional[int] = None, timeout: Optional[timedelta] = None
|
||
):
|
||
raise NotImplementedError("to_batches not yet supported on a hybrid query")
|
||
|
||
@staticmethod
|
||
def _rank(results: pa.Table, column: str, ascending: bool = True):
|
||
if len(results) == 0:
|
||
return results
|
||
# Get the _score column from results
|
||
scores = results.column(column).to_numpy()
|
||
sort_indices = np.argsort(scores)
|
||
if not ascending:
|
||
sort_indices = sort_indices[::-1]
|
||
ranks = np.empty_like(sort_indices)
|
||
ranks[sort_indices] = np.arange(len(scores)) + 1
|
||
# replace the _score column with the ranks
|
||
_score_idx = results.column_names.index(column)
|
||
results = results.set_column(
|
||
_score_idx, column, pa.array(ranks, type=pa.float32())
|
||
)
|
||
return results
|
||
|
||
@staticmethod
|
||
def _normalize_scores(scores: pa.Array, invert=False) -> pa.Array:
|
||
if len(scores) == 0:
|
||
return scores
|
||
# normalize the scores by subtracting the min and dividing by the max
|
||
min, max = pc.min_max(scores).values()
|
||
rng = pc.subtract(max, min)
|
||
|
||
if not pc.equal(rng, pa.scalar(0.0)).as_py():
|
||
scores = pc.divide(pc.subtract(scores, min), rng)
|
||
elif not pc.equal(max, pa.scalar(0.0)).as_py():
|
||
# If rng is 0, then we at least want the scores to be 0
|
||
scores = pc.subtract(scores, min)
|
||
|
||
if invert:
|
||
scores = pc.subtract(1, scores)
|
||
|
||
return scores
|
||
|
||
def rerank(
|
||
self,
|
||
reranker: Reranker = RRFReranker(),
|
||
normalize: str = "score",
|
||
) -> LanceHybridQueryBuilder:
|
||
"""
|
||
Rerank the hybrid search results using the specified reranker. The reranker
|
||
must be an instance of Reranker class.
|
||
|
||
Parameters
|
||
----------
|
||
reranker: Reranker, default RRFReranker()
|
||
The reranker to use. Must be an instance of Reranker class.
|
||
normalize: str, default "score"
|
||
The method to normalize the scores. Can be "rank" or "score". If "rank",
|
||
the scores are converted to ranks and then normalized. If "score", the
|
||
scores are normalized directly.
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
if normalize not in ["rank", "score"]:
|
||
raise ValueError("normalize must be 'rank' or 'score'.")
|
||
if reranker and not isinstance(reranker, Reranker):
|
||
raise ValueError("reranker must be an instance of Reranker class.")
|
||
|
||
self._norm = normalize
|
||
self._reranker = reranker
|
||
if reranker.score == "all":
|
||
self.with_row_id(True)
|
||
|
||
return self
|
||
|
||
def nprobes(self, nprobes: int) -> LanceHybridQueryBuilder:
|
||
"""
|
||
Set the number of probes to use for vector search.
|
||
|
||
Higher values will yield better recall (more likely to find vectors if
|
||
they exist) at the expense of latency.
|
||
|
||
Parameters
|
||
----------
|
||
nprobes: int
|
||
The number of probes to use.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._minimum_nprobes = nprobes
|
||
self._maximum_nprobes = nprobes
|
||
return self
|
||
|
||
def minimum_nprobes(self, minimum_nprobes: int) -> LanceHybridQueryBuilder:
|
||
"""Set the minimum number of probes to use.
|
||
|
||
See `nprobes` for more details.
|
||
"""
|
||
self._minimum_nprobes = minimum_nprobes
|
||
return self
|
||
|
||
def maximum_nprobes(self, maximum_nprobes: int) -> LanceHybridQueryBuilder:
|
||
"""Set the maximum number of probes to use.
|
||
|
||
See `nprobes` for more details.
|
||
"""
|
||
self._maximum_nprobes = maximum_nprobes
|
||
return self
|
||
|
||
def distance_range(
|
||
self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
|
||
) -> LanceHybridQueryBuilder:
|
||
"""
|
||
Set the distance range to use.
|
||
|
||
Only rows with distances within range [lower_bound, upper_bound)
|
||
will be returned.
|
||
|
||
Parameters
|
||
----------
|
||
lower_bound: Optional[float]
|
||
The lower bound of the distance range.
|
||
upper_bound: Optional[float]
|
||
The upper bound of the distance range.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._lower_bound = lower_bound
|
||
self._upper_bound = upper_bound
|
||
return self
|
||
|
||
def ef(self, ef: int) -> LanceHybridQueryBuilder:
|
||
"""
|
||
Set the number of candidates to consider during search.
|
||
|
||
Higher values will yield better recall (more likely to find vectors if
|
||
they exist) at the expense of latency.
|
||
|
||
This only applies to the HNSW-related index.
|
||
The default value is 1.5 * limit.
|
||
|
||
Parameters
|
||
----------
|
||
ef: int
|
||
The number of candidates to consider during search.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._ef = ef
|
||
return self
|
||
|
||
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
|
||
"""Set the distance metric to use.
|
||
|
||
This is an alias for distance_type() and may be deprecated in the future.
|
||
Please use distance_type() instead.
|
||
|
||
Parameters
|
||
----------
|
||
metric: "l2" or "cosine" or "dot"
|
||
The distance metric to use. By default "l2" is used.
|
||
|
||
Returns
|
||
-------
|
||
LanceVectorQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
return self.distance_type(metric)
|
||
|
||
def distance_type(
|
||
self, distance_type: Literal["l2", "cosine", "dot"]
|
||
) -> "LanceHybridQueryBuilder":
|
||
"""Set the distance metric to use.
|
||
|
||
When performing a vector search we try and find the "nearest" vectors according
|
||
to some kind of distance metric. This parameter controls which distance metric
|
||
to use.
|
||
|
||
Note: if there is a vector index then the distance type used MUST match the
|
||
distance type used to train the vector index. If this is not done then the
|
||
results will be invalid.
|
||
|
||
Parameters
|
||
----------
|
||
distance_type: "l2" or "cosine" or "dot"
|
||
The distance metric to use. By default "l2" is used.
|
||
|
||
Returns
|
||
-------
|
||
LanceVectorQueryBuilder
|
||
The LanceQueryBuilder object.
|
||
"""
|
||
self._distance_type = distance_type.lower()
|
||
return self
|
||
|
||
def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
|
||
"""
|
||
Refine the vector search results by reading extra elements and
|
||
re-ranking them in memory.
|
||
|
||
Parameters
|
||
----------
|
||
refine_factor: int
|
||
The refine factor to use.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._refine_factor = refine_factor
|
||
return self
|
||
|
||
def vector(self, vector: Union[np.ndarray, list]) -> LanceHybridQueryBuilder:
|
||
self._vector = vector
|
||
return self
|
||
|
||
def text(self, text: str | FullTextQuery) -> LanceHybridQueryBuilder:
|
||
self._text = text
|
||
return self
|
||
|
||
def bypass_vector_index(self) -> LanceHybridQueryBuilder:
|
||
"""
|
||
If this is called then any vector index is skipped
|
||
|
||
An exhaustive (flat) search will be performed. The query vector will
|
||
be compared to every vector in the table. At high scales this can be
|
||
expensive. However, this is often still useful. For example, skipping
|
||
the vector index can give you ground truth results which you can use to
|
||
calculate your recall to select an appropriate value for nprobes.
|
||
|
||
Returns
|
||
-------
|
||
LanceHybridQueryBuilder
|
||
The LanceHybridQueryBuilder object.
|
||
"""
|
||
self._bypass_vector_index = True
|
||
return self
|
||
|
||
def explain_plan(self, verbose: Optional[bool] = False) -> str:
|
||
"""Return the execution plan for this query.
|
||
|
||
Examples
|
||
--------
|
||
>>> import lancedb
|
||
>>> db = lancedb.connect("./.lancedb")
|
||
>>> table = db.create_table("my_table", [{"vector": [99.0, 99]}])
|
||
>>> query = [100, 100]
|
||
>>> plan = table.search(query).explain_plan(True)
|
||
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
|
||
GlobalLimitExec: skip=0, fetch=10
|
||
FilterExec: _distance@2 IS NOT NULL
|
||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
|
||
KNNVectorDistance: metric=l2
|
||
LanceRead: uri=..., projection=[vector], ...
|
||
|
||
Parameters
|
||
----------
|
||
verbose : bool, default False
|
||
Use a verbose output format.
|
||
|
||
Returns
|
||
-------
|
||
plan : str
|
||
""" # noqa: E501
|
||
self._create_query_builders()
|
||
|
||
reranker_label = str(self._reranker) if self._reranker else "No reranker"
|
||
vector_plan = self._table._explain_plan(
|
||
self._vector_query.to_query_object(), verbose=verbose
|
||
)
|
||
fts_plan = self._table._explain_plan(
|
||
self._fts_query.to_query_object(), verbose=verbose
|
||
)
|
||
# Indent sub-plans under the reranker
|
||
indented_vector = "\n".join(" " + line for line in vector_plan.splitlines())
|
||
indented_fts = "\n".join(" " + line for line in fts_plan.splitlines())
|
||
return f"{reranker_label}\n {indented_vector}\n {indented_fts}"
|
||
|
||
def analyze_plan(self):
|
||
"""Execute the query and display with runtime metrics.
|
||
|
||
Returns
|
||
-------
|
||
plan : str
|
||
"""
|
||
self._create_query_builders()
|
||
|
||
results = ["Vector Search Plan:"]
|
||
results.append(self._table._analyze_plan(self._vector_query.to_query_object()))
|
||
results.append("FTS Search Plan:")
|
||
results.append(self._table._analyze_plan(self._fts_query.to_query_object()))
|
||
return "\n".join(results)
|
||
|
||
def _create_query_builders(self):
|
||
"""Set up and configure the vector and FTS query builders."""
|
||
vector_query, fts_query = self._validate_query(
|
||
self._query, self._vector, self._text
|
||
)
|
||
self._fts_query = LanceFtsQueryBuilder(
|
||
self._table, fts_query, fts_columns=self._fts_columns
|
||
)
|
||
vector_query = self._query_to_vector(
|
||
self._table, vector_query, self._vector_column
|
||
)
|
||
self._vector_query = LanceVectorQueryBuilder(
|
||
self._table, vector_query, self._vector_column
|
||
)
|
||
|
||
# Apply common configurations
|
||
if self._limit:
|
||
self._vector_query.limit(self._limit)
|
||
self._fts_query.limit(self._limit)
|
||
if self._columns:
|
||
self._vector_query.select(self._columns)
|
||
self._fts_query.select(self._columns)
|
||
if self._where:
|
||
self._vector_query.where(self._where, not self._postfilter)
|
||
self._fts_query.where(self._where, not self._postfilter)
|
||
if self._with_row_id:
|
||
self._vector_query.with_row_id(True)
|
||
self._fts_query.with_row_id(True)
|
||
if self._phrase_query:
|
||
self._fts_query.phrase_query(True)
|
||
if self._distance_type:
|
||
self._vector_query.metric(self._distance_type)
|
||
if self._minimum_nprobes:
|
||
self._vector_query.minimum_nprobes(self._minimum_nprobes)
|
||
if self._maximum_nprobes is not None:
|
||
self._vector_query.maximum_nprobes(self._maximum_nprobes)
|
||
if self._refine_factor:
|
||
self._vector_query.refine_factor(self._refine_factor)
|
||
if self._ef:
|
||
self._vector_query.ef(self._ef)
|
||
if self._bypass_vector_index:
|
||
self._vector_query.bypass_vector_index()
|
||
if self._lower_bound or self._upper_bound:
|
||
self._vector_query.distance_range(
|
||
lower_bound=self._lower_bound, upper_bound=self._upper_bound
|
||
)
|
||
|
||
if self._reranker is None:
|
||
self._reranker = RRFReranker()
|
||
|
||
|
||
class AsyncQueryBase(object):
    """
    Base class for all async queries (take, scan, vector, fts, hybrid)
    """

    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery, LanceTakeQuery]):
        """
        Construct an AsyncQueryBase

        This method is not intended to be called directly. Instead, use the
        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
        """
        # The native (Rust-backed) query object; every builder method below
        # delegates to it.
        self._inner = inner

    def to_query_object(self) -> Query:
        """
        Convert the query into a query object

        This is currently experimental but can be useful as the query object is pure
        python and more easily serializable.
        """
        return Query.from_inner(self._inner.to_query_request())

    def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
        """
        Return only the specified columns.

        By default a query will return all columns from the table. However, this can
        have a very significant impact on latency. LanceDb stores data in a columnar
        fashion. This
        means we can finely tune our I/O to select exactly the columns we need.

        As a best practice you should always limit queries to the columns that you need.
        If you pass in a list of column names then only those columns will be
        returned.

        You can also use this method to create new "dynamic" columns based on your
        existing columns. For example, you may not care about "a" or "b" but instead
        simply want "a + b". This is often seen in the SELECT clause of an SQL query
        (e.g. `SELECT a+b FROM my_table`).

        To create dynamic columns you can pass in a dict[str, str]. A column will be
        returned for each entry in the map. The key provides the name of the column.
        The value is an SQL string used to specify how the column is calculated.

        For example, an SQL query might state `SELECT a + b AS combined, c`. The
        equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.

        Columns will always be returned in the order given, even if that order is
        different than the order used when adding the data.
        """
        if isinstance(columns, list) and all(isinstance(c, str) for c in columns):
            self._inner.select_columns(columns)
        elif isinstance(columns, dict) and all(
            isinstance(k, str) and isinstance(v, str) for k, v in columns.items()
        ):
            # dict form becomes (output name, SQL expression) pairs
            self._inner.select(list(columns.items()))
        else:
            raise TypeError("columns must be a list of column names or a dict")
        return self

    def with_row_id(self) -> Self:
        """
        Include the _rowid column in the results.
        """
        self._inner.with_row_id()
        return self

    async def to_batches(
        self,
        *,
        max_batch_length: Optional[int] = None,
        timeout: Optional[timedelta] = None,
    ) -> AsyncRecordBatchReader:
        """
        Execute the query and return the results as an Apache Arrow RecordBatchReader.

        Parameters
        ----------

        max_batch_length: Optional[int]
            The maximum number of selected records in a single RecordBatch object.
            If not specified, a default batch length is used.
            It is possible for batches to be smaller than the provided length if the
            underlying data is stored in smaller chunks.
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return AsyncRecordBatchReader(
            await self._inner.execute(
                max_batch_length=max_batch_length, timeout=timeout
            )
        )

    async def output_schema(self) -> pa.Schema:
        """
        Return the output schema for the query

        This does not execute the query.
        """
        return await self._inner.output_schema()

    async def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
        """
        Execute the query and collect the results into an Apache Arrow Table.

        This method will collect all results into memory before returning. If
        you expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches]

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        batch_iter = await self.to_batches(timeout=timeout)
        return pa.Table.from_batches(
            await batch_iter.read_all(), schema=batch_iter.schema
        )

    async def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
        """
        Execute the query and return the results as a list of dictionaries.

        Each list entry is a dictionary with the selected column names as keys,
        or all table columns if `select` is not called. The vector and the "_distance"
        fields are returned whether or not they're explicitly selected.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return (await self.to_arrow(timeout=timeout)).to_pylist()

    async def to_pandas(
        self,
        flatten: Optional[Union[int, bool]] = None,
        timeout: Optional[timedelta] = None,
    ) -> "pd.DataFrame":
        """
        Execute the query and collect the results into a pandas DataFrame.

        This method will collect all results into memory before returning. If you
        expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
        pandas separately.

        Examples
        --------

        >>> import asyncio
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
        ...     async for batch in await table.query().to_batches():
        ...         batch_df = batch.to_pandas()
        >>> asyncio.run(doctest_example())

        Parameters
        ----------
        flatten: Optional[Union[int, bool]]
            If flatten is True, flatten all nested columns.
            If flatten is an integer, flatten the nested columns up to the
            specified depth.
            If unspecified, do not flatten the nested columns.
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return (
            flatten_columns(await self.to_arrow(timeout=timeout), flatten)
        ).to_pandas()

    async def to_polars(
        self,
        timeout: Optional[timedelta] = None,
    ) -> "pl.DataFrame":
        """
        Execute the query and collect the results into a Polars DataFrame.

        This method will collect all results into memory before returning. If you
        expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
        polars separately.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.

        Examples
        --------

        >>> import asyncio
        >>> import polars as pl
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
        ...     async for batch in await table.query().to_batches():
        ...         batch_df = pl.from_arrow(batch)
        >>> asyncio.run(doctest_example())
        """
        # Imported lazily so polars stays an optional dependency.
        import polars as pl

        return pl.from_arrow(await self.to_arrow(timeout=timeout))

    async def to_pydantic(
        self, model: Type[LanceModel], *, timeout: Optional[timedelta] = None
    ) -> List[LanceModel]:
        """
        Convert results to a list of pydantic models.

        Parameters
        ----------
        model : Type[LanceModel]
            The pydantic model to use.
        timeout : timedelta, optional
            The maximum time to wait for the query to complete.
            If None, wait indefinitely.

        Returns
        -------
        list[LanceModel]
        """
        return [
            model(**row) for row in (await self.to_arrow(timeout=timeout)).to_pylist()
        ]

    async def explain_plan(self, verbose: Optional[bool] = False):
        """Return the execution plan for this query.

        Examples
        --------
        >>> import asyncio
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", [{"vector": [99.0, 99.0]}])
        ...     plan = await table.query().nearest_to([1.0, 2.0]).explain_plan(True)
        ...     print(plan)
        >>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
          GlobalLimitExec: skip=0, fetch=10
            FilterExec: _distance@2 IS NOT NULL
              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                KNNVectorDistance: metric=l2
                  LanceRead: uri=..., projection=[vector], ...
        <BLANKLINE>

        Parameters
        ----------
        verbose : bool, default False
            Use a verbose output format.

        Returns
        -------
        plan : str
        """  # noqa: E501
        return await self._inner.explain_plan(verbose)

    async def analyze_plan(self):
        """Execute the query and display with runtime metrics.

        Returns
        -------
        plan : str
        """
        return await self._inner.analyze_plan()
|
||
|
||
|
||
class AsyncStandardQuery(AsyncQueryBase):
    """
    Base class for "standard" async queries (all but take currently)
    """

    def __init__(self, inner: Union[LanceQuery, LanceVectorQuery]):
        """
        Construct an AsyncStandardQuery

        This method is not intended to be called directly. Instead, use the
        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
        """
        super().__init__(inner)

    def where(self, predicate: str) -> Self:
        """
        Only return rows matching the given predicate.

        The predicate is an SQL filter expression supplied as a string.

        Examples
        --------

        >>> predicate = "x > 10"
        >>> predicate = "y > 0 AND y < 100"
        >>> predicate = "x > 5 OR y = 'test'"

        Filtering performance can often be improved by creating a scalar index
        on the filter column(s).
        """
        self._inner.where(predicate)
        return self

    def limit(self, limit: int) -> Self:
        """
        Cap the number of rows returned.

        A plain search has no limit by default; without calling this method,
        every valid row from the table is returned.
        """
        self._inner.limit(limit)
        return self

    def offset(self, offset: int) -> Self:
        """
        Skip the first `offset` results.

        Parameters
        ----------
        offset: int
            The offset to start fetching results from.
        """
        self._inner.offset(offset)
        return self

    def fast_search(self) -> Self:
        """
        Skip searching un-indexed data.

        This makes queries faster, but any rows not yet covered by an index
        are silently missed.

        !!! tip
            You can add new data into an existing index by calling
            [AsyncTable.optimize][lancedb.table.AsyncTable.optimize].
        """
        self._inner.fast_search()
        return self

    def postfilter(self) -> Self:
        """
        Apply the filter after the search instead of before it.

        By default the filter runs first (prefiltering), which matches how
        filtering is usually understood but adds latency; a scalar index on
        the filter column(s) often helps. When a filter is too complex, or a
        scalar index cannot be applied, postfiltering may be faster: it runs
        the filter only over the search hits. The trade-off is that fewer
        than `limit` rows (or none) may come back if the nearest hits fail
        the filter. Postfiltering happens during the "refine stage"
        (see @see {@link VectorQuery#refineFactor}), so a higher refine
        factor can recover some of the rows lost to it.
        """
        self._inner.postfilter()
        return self
|
||
|
||
|
||
class AsyncQuery(AsyncStandardQuery):
    def __init__(self, inner: LanceQuery):
        """
        Construct an AsyncQuery

        This method is not intended to be called directly. Instead, use the
        [AsyncTable.query][lancedb.table.AsyncTable.query] method to create a query.
        """
        super().__init__(inner)
        # Re-assigned here so this class carries the narrower LanceQuery type.
        self._inner = inner

    @classmethod
    def _query_vec_to_array(self, vec: Union[VEC, Tuple]):
        # Normalize any supported query-vector representation to a pyarrow Array.
        # NOTE(review): declared with `self` although decorated as a classmethod;
        # the first argument actually receives the class object.
        if isinstance(vec, list):
            return pa.array(vec)
        if isinstance(vec, np.ndarray):
            return pa.array(vec)
        if isinstance(vec, pa.Array):
            return vec
        if isinstance(vec, pa.ChunkedArray):
            return vec.combine_chunks()
        if isinstance(vec, tuple):
            return pa.array(vec)
        # We've checked everything we formally support in our typings
        # but, as a fallback, let pyarrow try and convert it anyway.
        # This can allow for some more exotic things like iterables
        return pa.array(vec)

    def nearest_to(
        self,
        query_vector: Union[VEC, Tuple, List[VEC]],
    ) -> AsyncVectorQuery:
        """
        Find the nearest vectors to the given query vector.

        This converts the query from a plain query to a vector query.

        This method will attempt to convert the input to the query vector
        expected by the embedding model. If the input cannot be converted
        then an error will be thrown.

        By default, there is no embedding model, and the input should be
        something that can be converted to a pyarrow array of floats. This
        includes lists, numpy arrays, and tuples.

        If there is only one vector column (a column whose data type is a
        fixed size list of floats) then the column does not need to be specified.
        If there is more than one vector column you must use
        [AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
        which column you would like to compare with.

        If no index has been created on the vector column then a vector query
        will perform a distance comparison between the query vector and every
        vector in the database and then sort the results. This is sometimes
        called a "flat search"

        For small databases, with tens of thousands of vectors or less, this can
        be reasonably fast. In larger databases you should create a vector index
        on the column. If there is a vector index then an "approximate" nearest
        neighbor search (frequently called an ANN search) will be performed. This
        search is much faster, but the results will be approximate.

        The query can be further parameterized using the returned builder. There
        are various ANN search parameters that will let you fine tune your recall
        accuracy vs search latency.

        Vector searches always have a [limit][]. If `limit` has not been called then
        a default `limit` of 10 will be used.

        Typically, a single vector is passed in as the query. However, you can also
        pass in multiple vectors. When multiple vectors are passed in, if the vector
        column is with multivector type, then the vectors will be treated as a single
        query. Or the vectors will be treated as multiple queries, this can be useful
        if you want to find the nearest vectors to multiple query vectors.
        This is not expected to be faster than making multiple queries concurrently;
        it is just a convenience method. If multiple vectors are passed in then
        an additional column `query_index` will be added to the results. This column
        will contain the index of the query vector that the result is nearest to.
        """
        if query_vector is None:
            raise ValueError("query_vector can not be None")

        # A sequence whose first element is itself a sequence means multiple
        # query vectors were supplied.
        if (
            isinstance(query_vector, (list, np.ndarray, pa.Array))
            and len(query_vector) > 0
            and isinstance(query_vector[0], (list, np.ndarray, pa.Array))
        ):
            # multiple have been passed
            query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
            new_self = self._inner.nearest_to(query_vectors[0])
            for v in query_vectors[1:]:
                new_self.add_query_vector(v)
            return AsyncVectorQuery(new_self)
        else:
            return AsyncVectorQuery(
                self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
            )

    def nearest_to_text(
        self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
    ) -> AsyncFTSQuery:
        """
        Find the documents that are most relevant to the given text query.

        This method will perform a full text search on the table and return
        the most relevant documents. The relevance is determined by BM25.

        The columns to search must be with native FTS index
        (Tantivy-based can't work with this method).

        By default, all indexed columns are searched,
        now only one column can be searched at a time.

        Parameters
        ----------
        query: str
            The text query to search for.
        columns: str or list of str, default None
            The columns to search in. If None, all indexed columns are searched.
            For now only one column can be searched at a time.
        """
        if isinstance(columns, str):
            columns = [columns]
        if columns is None:
            columns = []

        if isinstance(query, str):
            return AsyncFTSQuery(
                self._inner.nearest_to_text({"query": query, "columns": columns})
            )
        # FullTextQuery object
        return AsyncFTSQuery(self._inner.nearest_to_text({"query": query}))
|
||
|
||
|
||
class AsyncFTSQuery(AsyncStandardQuery):
    """A query for full text search for LanceDB."""

    def __init__(self, inner: LanceFTSQuery):
        super().__init__(inner)
        self._inner = inner
        # Optional Python-side reranker; applied in `to_batches` after the
        # native query has produced its results.
        self._reranker = None

    def get_query(self) -> str:
        # Text of the underlying FTS query, as held by the native object.
        return self._inner.get_query()

    def rerank(
        self,
        reranker: Reranker = RRFReranker(),
    ) -> AsyncFTSQuery:
        # Validate and store the reranker; the actual reranking happens when
        # the query is executed (see `to_batches`).
        if reranker and not isinstance(reranker, Reranker):
            raise ValueError("reranker must be an instance of Reranker class.")

        self._reranker = reranker

        return self

    def nearest_to(
        self,
        query_vector: Union[VEC, Tuple, List[VEC]],
    ) -> AsyncHybridQuery:
        """
        In addition doing text search on the LanceDB Table, also
        find the nearest vectors to the given query vector.

        This converts the query from a FTS Query to a Hybrid query. Results
        from the vector search will be combined with results from the FTS query.

        This method will attempt to convert the input to the query vector
        expected by the embedding model. If the input cannot be converted
        then an error will be thrown.

        By default, there is no embedding model, and the input should be
        something that can be converted to a pyarrow array of floats. This
        includes lists, numpy arrays, and tuples.

        If there is only one vector column (a column whose data type is a
        fixed size list of floats) then the column does not need to be specified.
        If there is more than one vector column you must use
        [AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
        which column you would like to compare with.

        If no index has been created on the vector column then a vector query
        will perform a distance comparison between the query vector and every
        vector in the database and then sort the results. This is sometimes
        called a "flat search"

        For small databases, with tens of thousands of vectors or less, this can
        be reasonably fast. In larger databases you should create a vector index
        on the column. If there is a vector index then an "approximate" nearest
        neighbor search (frequently called an ANN search) will be performed. This
        search is much faster, but the results will be approximate.

        The query can be further parameterized using the returned builder. There
        are various ANN search parameters that will let you fine tune your recall
        accuracy vs search latency.

        Hybrid searches always have a [limit][]. If `limit` has not been called then
        a default `limit` of 10 will be used.

        Typically, a single vector is passed in as the query. However, you can also
        pass in multiple vectors. This can be useful if you want to find the nearest
        vectors to multiple query vectors. This is not expected to be faster than
        making multiple queries concurrently; it is just a convenience method.
        If multiple vectors are passed in then an additional column `query_index`
        will be added to the results. This column will contain the index of the
        query vector that the result is nearest to.
        """
        if query_vector is None:
            raise ValueError("query_vector can not be None")

        # A non-empty list whose first element is not a scalar number is
        # treated as a batch of query vectors.
        if (
            isinstance(query_vector, list)
            and len(query_vector) > 0
            and not isinstance(query_vector[0], (float, int))
        ):
            # multiple have been passed
            query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
            new_self = self._inner.nearest_to(query_vectors[0])
            for v in query_vectors[1:]:
                new_self.add_query_vector(v)
            return AsyncHybridQuery(new_self)
        else:
            return AsyncHybridQuery(
                self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
            )

    async def to_batches(
        self,
        *,
        max_batch_length: Optional[int] = None,
        timeout: Optional[timedelta] = None,
    ) -> AsyncRecordBatchReader:
        # Materialize the whole result set so the (optional) reranker sees
        # every row, then re-chunk into a fresh reader.
        reader = await super().to_batches(timeout=timeout)
        results = pa.Table.from_batches(await reader.read_all(), reader.schema)
        if self._reranker:
            results = self._reranker.rerank_fts(self.get_query(), results)
        return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
|
||
|
||
|
||
class AsyncVectorQueryBase:
    """Mixin providing the tuning knobs shared by vector and hybrid queries.

    Every method delegates to the underlying native query builder
    (``self._inner``) and returns ``self`` so calls can be chained fluently.
    """

    def column(self, column: str) -> Self:
        """
        Set the vector column to query

        This controls which column is compared to the query vector supplied in
        the call to [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to].

        This parameter must be specified if the table has more than one column
        whose data type is a fixed-size-list of floats.
        """
        self._inner.column(column)
        return self

    def nprobes(self, nprobes: int) -> Self:
        """
        Set the number of partitions to search (probe)

        This argument is only used when the vector column has an IVF-based index.
        If there is no index then this value is ignored.

        The IVF stage of IVF PQ divides the input into partitions (clusters) of
        related values.

        The partition whose centroids are closest to the query vector will be
        exhaustively searched to find matches. This parameter controls how many
        partitions should be searched.

        Increasing this value will increase the recall of your query but will
        also increase the latency of your query. The default value is 20. This
        default is good for many cases but the best value to use will depend on
        your data and the recall that you need to achieve.

        For best results we recommend tuning this parameter with a benchmark against
        your actual data to find the smallest possible value that will still give
        you the desired recall.
        """
        self._inner.nprobes(nprobes)
        return self

    def minimum_nprobes(self, minimum_nprobes: int) -> Self:
        """Set the minimum number of probes to use.

        See `nprobes` for more details.

        These partitions will be searched on every indexed vector query and will
        increase recall at the expense of latency.
        """
        self._inner.minimum_nprobes(minimum_nprobes)
        return self

    def maximum_nprobes(self, maximum_nprobes: int) -> Self:
        """Set the maximum number of probes to use.

        See `nprobes` for more details.

        If this value is greater than `minimum_nprobes` then the excess partitions
        will be searched only if we have not found enough results.

        This can be useful when there is a narrow filter to allow these queries to
        spend more time searching and avoid potential false negatives.

        If this value is 0 then no limit will be applied and all partitions could be
        searched if needed to satisfy the limit.
        """
        self._inner.maximum_nprobes(maximum_nprobes)
        return self

    def distance_range(
        self, lower_bound: Optional[float] = None, upper_bound: Optional[float] = None
    ) -> Self:
        """Set the distance range to use.

        Only rows with distances within range [lower_bound, upper_bound)
        will be returned.

        Parameters
        ----------
        lower_bound: Optional[float]
            The lower bound of the distance range.
        upper_bound: Optional[float]
            The upper bound of the distance range.

        Returns
        -------
        AsyncVectorQuery
            The AsyncVectorQuery object.
        """
        self._inner.distance_range(lower_bound, upper_bound)
        return self

    def ef(self, ef: int) -> Self:
        """
        Set the number of candidates to consider during search

        This argument is only used when the vector column has an HNSW index.
        If there is no index then this value is ignored.

        Increasing this value will increase the recall of your query but will also
        increase the latency of your query. The default value is 1.5 * limit. This
        default is good for many cases but the best value to use will depend on your
        data and the recall that you need to achieve.
        """
        self._inner.ef(ef)
        return self

    def refine_factor(self, refine_factor: int) -> Self:
        """
        A multiplier to control how many additional rows are taken during the refine
        step

        This argument is only used when the vector column has an IVF PQ index.
        If there is no index then this value is ignored.

        An IVF PQ index stores compressed (quantized) values. The query vector is
        compared against these values and, since they are compressed, the comparison is
        inaccurate.

        This parameter can be used to refine the results. It can both improve
        recall and correct the ordering of the nearest results.

        To refine results LanceDb will first perform an ANN search to find the nearest
        `limit` * `refine_factor` results. In other words, if `refine_factor` is 3 and
        `limit` is the default (10) then the first 30 results will be selected. LanceDb
        then fetches the full, uncompressed, values for these 30 results. The results
        are then reordered by the true distance and only the nearest 10 are kept.

        Note: there is a difference between calling this method with a value of 1 and
        never calling this method at all. Calling this method with any value will have
        an impact on your search latency. When you call this method with a
        `refine_factor` of 1 then LanceDb still needs to fetch the full, uncompressed,
        values so that it can potentially reorder the results.

        Note: if this method is NOT called then the distances returned in the _distance
        column will be approximate distances based on the comparison of the quantized
        query vector and the quantized result vectors. This can be considerably
        different than the true distance between the query vector and the actual
        uncompressed vector.
        """
        self._inner.refine_factor(refine_factor)
        return self

    def distance_type(self, distance_type: str) -> Self:
        """
        Set the distance metric to use

        When performing a vector search we try and find the "nearest" vectors according
        to some kind of distance metric. This parameter controls which distance metric
        to use. See @see {@link IvfPqOptions.distanceType} for more details on the
        different distance metrics available.

        Note: if there is a vector index then the distance type used MUST match the
        distance type used to train the vector index. If this is not done then the
        results will be invalid.

        By default "l2" is used.
        """
        self._inner.distance_type(distance_type)
        return self

    def bypass_vector_index(self) -> Self:
        """
        If this is called then any vector index is skipped

        An exhaustive (flat) search will be performed. The query vector will
        be compared to every vector in the table. At high scales this can be
        expensive. However, this is often still useful. For example, skipping
        the vector index can give you ground truth results which you can use to
        calculate your recall to select an appropriate value for nprobes.
        """
        self._inner.bypass_vector_index()
        return self
|
||
|
||
|
||
class AsyncVectorQuery(AsyncStandardQuery, AsyncVectorQueryBase):
    def __init__(self, inner: LanceVectorQuery):
        """
        Construct an AsyncVectorQuery

        This method is not intended to be called directly. Instead, create
        a query first with [AsyncTable.query][lancedb.table.AsyncTable.query] and then
        use [AsyncQuery.nearest_to][lancedb.query.AsyncQuery.nearest_to] to convert to
        a vector query. Or you can use
        [AsyncTable.vector_search][lancedb.table.AsyncTable.vector_search]
        """
        super().__init__(inner)
        self._inner = inner
        # Reranker state; configured via `rerank` and applied in `to_batches`.
        self._reranker = None
        self._query_string = None

    def rerank(
        self, reranker: Reranker = RRFReranker(), query_string: Optional[str] = None
    ) -> AsyncVectorQuery:
        """
        Rerank the vector search results using the given reranker.

        Parameters
        ----------
        reranker: Reranker, default RRFReranker()
            The reranker to use. Must be an instance of Reranker class.
        query_string: Optional[str], default None
            The query string used by the reranker. Required the first time this
            method is called; if omitted on later calls, the previously supplied
            query string is kept.

        Returns
        -------
        AsyncVectorQuery
            The AsyncVectorQuery object.
        """
        if reranker and not isinstance(reranker, Reranker):
            raise ValueError("reranker must be an instance of Reranker class.")

        # Validate before mutating any state so a failing call leaves the
        # builder unchanged.
        if not self._query_string and not query_string:
            raise ValueError("query_string must be provided to rerank the results.")

        self._reranker = reranker
        # Keep a previously supplied query string when none is given here.
        # (Previously this unconditionally overwrote it, wiping it with None
        # on a second call.)
        self._query_string = (
            query_string if query_string is not None else self._query_string
        )

        return self

    def nearest_to_text(
        self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
    ) -> AsyncHybridQuery:
        """
        Find the documents that are most relevant to the given text query,
        in addition to vector search.

        This converts the vector query into a hybrid query.

        This search will perform a full text search on the table and return
        the most relevant documents, combined with the vector query results.
        The text relevance is determined by BM25.

        The columns to search must be with native FTS index
        (Tantivy-based can't work with this method).

        By default, all indexed columns are searched,
        now only one column can be searched at a time.

        Parameters
        ----------
        query: str
            The text query to search for.
        columns: str or list of str, default None
            The columns to search in. If None, all indexed columns are searched.
            For now only one column can be searched at a time.
        """
        if isinstance(columns, str):
            columns = [columns]
        if columns is None:
            columns = []

        if isinstance(query, str):
            return AsyncHybridQuery(
                self._inner.nearest_to_text({"query": query, "columns": columns})
            )
        # FullTextQuery object
        return AsyncHybridQuery(self._inner.nearest_to_text({"query": query}))

    async def to_batches(
        self,
        *,
        max_batch_length: Optional[int] = None,
        timeout: Optional[timedelta] = None,
    ) -> AsyncRecordBatchReader:
        """Execute the vector query and return the results as an async
        record-batch reader.

        The full result set is collected in memory first because the optional
        reranker (configured via `rerank`) operates on the complete table.
        """
        reader = await super().to_batches(timeout=timeout)
        results = pa.Table.from_batches(await reader.read_all(), reader.schema)
        if self._reranker:
            results = self._reranker.rerank_vector(self._query_string, results)
        return AsyncRecordBatchReader(results, max_batch_length=max_batch_length)
|
||
|
||
|
||
class AsyncHybridQuery(AsyncStandardQuery, AsyncVectorQueryBase):
    """
    A query builder that performs hybrid vector and full text search.
    Results are combined and reranked based on the specified reranker.
    By default, the results are reranked using the RRFReranker, which
    uses reciprocal rank fusion score for reranking.

    To make the vector and fts results comparable, the scores are normalized.
    Instead of normalizing scores, the `normalize` parameter can be set to "rank"
    in the `rerank` method to convert the scores to ranks and then normalize them.
    """

    def __init__(self, inner: LanceHybridQuery):
        super().__init__(inner)
        self._inner = inner
        # Normalization strategy ("score" or "rank") and the reranker used to
        # fuse the FTS and vector results; both can be overridden via `rerank`.
        self._norm = "score"
        self._reranker = RRFReranker()

    def rerank(
        self, reranker: Reranker = RRFReranker(), normalize: str = "score"
    ) -> AsyncHybridQuery:
        """
        Rerank the hybrid search results using the specified reranker. The reranker
        must be an instance of Reranker class.

        Parameters
        ----------
        reranker: Reranker, default RRFReranker()
            The reranker to use. Must be an instance of Reranker class.
        normalize: str, default "score"
            The method to normalize the scores. Can be "rank" or "score". If "rank",
            the scores are converted to ranks and then normalized. If "score", the
            scores are normalized directly.
        Returns
        -------
        AsyncHybridQuery
            The AsyncHybridQuery object.
        """
        # Validate both arguments before storing them so a bad call leaves the
        # builder state untouched.
        if normalize not in ["rank", "score"]:
            raise ValueError("normalize must be 'rank' or 'score'.")
        if reranker and not isinstance(reranker, Reranker):
            raise ValueError("reranker must be an instance of Reranker class.")

        self._norm = normalize
        self._reranker = reranker

        return self

    async def to_batches(
        self,
        *,
        max_batch_length: Optional[int] = None,
        timeout: Optional[timedelta] = None,
    ) -> AsyncRecordBatchReader:
        # Split the hybrid query into its FTS and vector halves; each is
        # executed independently and the results are fused by the reranker.
        fts_query = AsyncFTSQuery(self._inner.to_fts_query())
        vec_query = AsyncVectorQuery(self._inner.to_vector_query())

        # save the row ID choice that was made on the query builder and force it
        # to actually fetch the row ids because we need this for reranking
        with_row_ids = self._inner.get_with_row_id()
        fts_query.with_row_id()
        vec_query.with_row_id()

        # Run both sub-queries concurrently; each gets the same timeout.
        fts_results, vector_results = await asyncio.gather(
            fts_query.to_arrow(timeout=timeout),
            vec_query.to_arrow(timeout=timeout),
        )

        # Reuse the sync builder's fusion logic: normalize scores, rerank,
        # apply the limit, and optionally drop the _rowid column again.
        result = LanceHybridQueryBuilder._combine_hybrid_results(
            fts_results=fts_results,
            vector_results=vector_results,
            norm=self._norm,
            fts_query=fts_query.get_query(),
            reranker=self._reranker,
            limit=self._inner.get_limit(),
            with_row_ids=with_row_ids,
        )

        return AsyncRecordBatchReader(result, max_batch_length=max_batch_length)

    async def explain_plan(self, verbose: Optional[bool] = False):
        """Return the execution plan for this query.

        The output includes both the vector and FTS search plans.

        Examples
        --------
        >>> import asyncio
        >>> from lancedb import connect_async
        >>> from lancedb.index import FTS
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", [{"vector": [99.0, 99.0], "text": "hello world"}])
        ...     await table.create_index("text", config=FTS(with_position=False))
        ...     plan = await table.query().nearest_to([1.0, 2.0]).nearest_to_text("hello").explain_plan(True)
        ...     print(plan)
        >>> asyncio.run(doctest_example())  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        RRFReranker(K=60)
          ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
            Take: columns="vector, _rowid, _distance, (text)"
              CoalesceBatchesExec: target_batch_size=1024
                GlobalLimitExec: skip=0, fetch=10
                  FilterExec: _distance@2 IS NOT NULL
                    SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                      KNNVectorDistance: metric=l2
                        LanceRead: uri=..., projection=[vector], ...
          ProjectionExec: expr=[vector@2 as vector, text@3 as text, _score@1 as _score]
            Take: columns="_rowid, _score, (vector), (text)"
              CoalesceBatchesExec: target_batch_size=1024
                GlobalLimitExec: skip=0, fetch=10
                  MatchQuery: column=text, query=hello

        Parameters
        ----------
        verbose : bool, default False
            Use a verbose output format.

        Returns
        -------
        plan : str
        """  # noqa: E501

        vector_plan = await self._inner.to_vector_query().explain_plan(verbose)
        fts_plan = await self._inner.to_fts_query().explain_plan(verbose)
        # Indent sub-plans under the reranker
        indented_vector = "\n".join(" " + line for line in vector_plan.splitlines())
        indented_fts = "\n".join(" " + line for line in fts_plan.splitlines())
        return f"{self._reranker}\n {indented_vector}\n {indented_fts}"

    async def analyze_plan(self):
        """
        Execute the query and return the physical execution plan with runtime metrics.

        This runs both the vector and FTS (full-text search) queries and returns
        detailed metrics for each step of execution—such as rows processed,
        elapsed time, I/O stats, and more. It’s useful for debugging and
        performance analysis.

        Returns
        -------
        plan : str
        """
        # Both sub-queries are actually executed here (unlike explain_plan),
        # so the returned plans include runtime metrics.
        results = ["Vector Search Query:"]
        results.append(await self._inner.to_vector_query().analyze_plan())
        results.append("FTS Search Query:")
        results.append(await self._inner.to_fts_query().analyze_plan())

        return "\n".join(results)
|
||
|
||
|
||
class AsyncTakeQuery(AsyncQueryBase):
    """Builder for parameterizing and executing take queries."""

    def __init__(self, inner: LanceTakeQuery):
        """Wrap the native take-query builder."""
        super().__init__(inner)
|
||
|
||
|
||
class BaseQueryBuilder(object):
    """
    Wraps AsyncQueryBase and provides a synchronous interface
    """

    def __init__(self, inner: AsyncQueryBase):
        # The wrapped async builder. Every execution method below blocks on
        # the shared background event loop (LOOP) to provide a sync facade.
        self._inner = inner

    def to_query_object(self) -> Query:
        # Build the serializable query description without executing anything.
        return self._inner.to_query_object()

    def select(self, columns: Union[List[str], dict[str, str]]) -> Self:
        """
        Return only the specified columns.

        By default a query will return all columns from the table. However, this can
        have a very significant impact on latency. LanceDb stores data in a columnar
        fashion. This
        means we can finely tune our I/O to select exactly the columns we need.

        As a best practice you should always limit queries to the columns that you need.
        If you pass in a list of column names then only those columns will be
        returned.

        You can also use this method to create new "dynamic" columns based on your
        existing columns. For example, you may not care about "a" or "b" but instead
        simply want "a + b". This is often seen in the SELECT clause of an SQL query
        (e.g. `SELECT a+b FROM my_table`).

        To create dynamic columns you can pass in a dict[str, str]. A column will be
        returned for each entry in the map. The key provides the name of the column.
        The value is an SQL string used to specify how the column is calculated.

        For example, an SQL query might state `SELECT a + b AS combined, c`. The
        equivalent input to this method would be `{"combined": "a + b", "c": "c"}`.

        Columns will always be returned in the order given, even if that order is
        different than the order used when adding the data.
        """
        self._inner.select(columns)
        return self

    def with_row_id(self) -> Self:
        """
        Include the _rowid column in the results.
        """
        self._inner.with_row_id()
        return self

    def output_schema(self) -> pa.Schema:
        """
        Return the output schema for the query

        This does not execute the query.
        """
        return LOOP.run(self._inner.output_schema())

    def to_batches(
        self,
        *,
        max_batch_length: Optional[int] = None,
        timeout: Optional[timedelta] = None,
    ) -> pa.RecordBatchReader:
        """
        Execute the query and return the results as an Apache Arrow RecordBatchReader.

        Parameters
        ----------

        max_batch_length: Optional[int]
            The maximum number of selected records in a single RecordBatch object.
            If not specified, a default batch length is used.
            It is possible for batches to be smaller than the provided length if the
            underlying data is stored in smaller chunks.
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        async_iter = LOOP.run(self._inner.execute(max_batch_length, timeout))

        def iter_sync():
            # Bridge the async iterator to a synchronous generator by driving
            # each __anext__ call on the background event loop.
            try:
                while True:
                    yield LOOP.run(async_iter.__anext__())
            except StopAsyncIteration:
                return

        return pa.RecordBatchReader.from_batches(async_iter.schema, iter_sync())

    def to_arrow(self, timeout: Optional[timedelta] = None) -> pa.Table:
        """
        Execute the query and collect the results into an Apache Arrow Table.

        This method will collect all results into memory before returning. If
        you expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches]

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return LOOP.run(self._inner.to_arrow(timeout))

    def to_list(self, timeout: Optional[timedelta] = None) -> List[dict]:
        """
        Execute the query and return the results as a list of dictionaries.

        Each list entry is a dictionary with the selected column names as keys,
        or all table columns if `select` is not called. The vector and the "_distance"
        fields are returned whether or not they're explicitly selected.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return LOOP.run(self._inner.to_list(timeout))

    def to_pandas(
        self,
        flatten: Optional[Union[int, bool]] = None,
        timeout: Optional[timedelta] = None,
    ) -> "pd.DataFrame":
        """
        Execute the query and collect the results into a pandas DataFrame.

        This method will collect all results into memory before returning. If you
        expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
        pandas separately.

        Examples
        --------

        >>> import asyncio
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
        ...     async for batch in await table.query().to_batches():
        ...         batch_df = batch.to_pandas()
        >>> asyncio.run(doctest_example())

        Parameters
        ----------
        flatten: Optional[Union[int, bool]]
            If flatten is True, flatten all nested columns.
            If flatten is an integer, flatten the nested columns up to the
            specified depth.
            If unspecified, do not flatten the nested columns.
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.
        """
        return LOOP.run(self._inner.to_pandas(flatten, timeout))

    def to_polars(
        self,
        timeout: Optional[timedelta] = None,
    ) -> "pl.DataFrame":
        """
        Execute the query and collect the results into a Polars DataFrame.

        This method will collect all results into memory before returning. If you
        expect a large number of results, you may want to use
        [to_batches][lancedb.query.AsyncQueryBase.to_batches] and convert each batch to
        polars separately.

        Parameters
        ----------
        timeout: Optional[timedelta]
            The maximum time to wait for the query to complete.
            If not specified, no timeout is applied. If the query does not
            complete within the specified time, an error will be raised.

        Examples
        --------

        >>> import asyncio
        >>> import polars as pl
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", data=[{"a": 1, "b": 2}])
        ...     async for batch in await table.query().to_batches():
        ...         batch_df = pl.from_arrow(batch)
        >>> asyncio.run(doctest_example())
        """
        return LOOP.run(self._inner.to_polars(timeout))

    def explain_plan(self, verbose: Optional[bool] = False):
        """Return the execution plan for this query.

        Examples
        --------
        >>> import asyncio
        >>> from lancedb import connect_async
        >>> async def doctest_example():
        ...     conn = await connect_async("./.lancedb")
        ...     table = await conn.create_table("my_table", [{"vector": [99.0, 99.0]}])
        ...     plan = await table.query().nearest_to([1.0, 2.0]).explain_plan(True)
        ...     print(plan)
        >>> asyncio.run(doctest_example())  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
          GlobalLimitExec: skip=0, fetch=10
            FilterExec: _distance@2 IS NOT NULL
              SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST, _rowid@1 ASC NULLS LAST], preserve_partitioning=[false]
                KNNVectorDistance: metric=l2
                  LanceRead: uri=..., projection=[vector], ...
        <BLANKLINE>

        Parameters
        ----------
        verbose : bool, default False
            Use a verbose output format.

        Returns
        -------
        plan : str
        """  # noqa: E501
        return LOOP.run(self._inner.explain_plan(verbose))

    def analyze_plan(self):
        """Execute the query and display with runtime metrics.

        Returns
        -------
        plan : str
        """
        return LOOP.run(self._inner.analyze_plan())
|
||
|
||
|
||
class LanceTakeQueryBuilder(BaseQueryBuilder):
    """
    Builder for parameterizing and executing take queries.
    """

    def __init__(self, inner: AsyncTakeQuery):
        # Expose the async take query through the synchronous builder facade.
        super().__init__(inner)
|