feat: update to lance 0.25.3b1 (#2294)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Chores**
  - Updated dependency versions for improved performance and
    compatibility.

- **New Features**
  - Added support for structured full-text search with expanded query
    types (e.g., match, phrase, boost, multi-match) and flexible input
    formats.
  - Introduced a new method to check server support for structural
    full-text search features.
  - Enhanced the query system with new classes and interfaces for handling
    various full-text queries.
  - Expanded the functionality of existing methods to accept more complex
    query structures, including updates to method signatures.

- **Bug Fixes**
  - Improved error handling and reporting for full-text search queries.

- **Refactor**
  - Enhanced query processing with streamlined input handling and improved
    error reporting, ensuring more robust and consistent search results
    across platforms.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
Weston Pace
2025-04-01 06:36:42 -07:00
committed by GitHub
parent e59f9382a0
commit 625bab3f21
25 changed files with 1442 additions and 183 deletions

View File

@@ -4,7 +4,9 @@
from __future__ import annotations
from abc import ABC, abstractmethod
import abc
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from typing import (
TYPE_CHECKING,
Dict,
@@ -83,6 +85,196 @@ def ensure_vector_query(
return val
class FullTextQueryType(Enum):
    """Kinds of structured full-text search queries.

    Each member's value is the string used as the top-level key when the
    corresponding query class serializes itself with ``to_dict``.
    """

    # Single-column term match (optionally fuzzy).
    MATCH = "match"
    # Exact phrase match on a single column.
    MATCH_PHRASE = "match_phrase"
    # Positive query whose score is adjusted by a negative query.
    BOOST = "boost"
    # The same query text matched against several columns.
    MULTI_MATCH = "multi_match"
class FullTextQuery(abc.ABC, pydantic.BaseModel):
    """Abstract base class for structured full-text search queries.

    Concrete subclasses report which kind of query they are and know how to
    serialize themselves into a plain dictionary.
    """

    @abc.abstractmethod
    def query_type(self) -> FullTextQueryType:
        """
        Get the query type of the query.

        Returns
        -------
        FullTextQueryType
            The type of the query.
        """

    @abc.abstractmethod
    def to_dict(self) -> dict:
        """
        Convert the query to a dictionary.

        Returns
        -------
        dict
            The query as a dictionary.
        """
class MatchQuery(FullTextQuery):
    """Match query for full-text search against a single column."""

    # The base class is a pydantic model, which rejects assignment to
    # attributes that are not declared fields; declare them explicitly and
    # populate them through the model constructor.
    query: str
    column: str
    boost: float = 1.0
    fuzziness: Optional[int] = 0
    max_expansions: int = 50

    def __init__(
        self,
        query: str,
        column: str,
        *,
        boost: float = 1.0,
        fuzziness: Optional[int] = 0,
        max_expansions: int = 50,
    ):
        """
        Match query for full-text search.

        Parameters
        ----------
        query : str
            The query string to match against.
        column : str
            The name of the column to match against.
        boost : float, default 1.0
            The boost factor for the query.
            The score of each matching document is multiplied by this value.
        fuzziness : int, optional
            The maximum edit distance for each term in the match query.
            Defaults to 0 (exact match).
            If None, fuzziness is applied automatically by the rules:
                - 0 for terms with length <= 2
                - 1 for terms with length <= 5
                - 2 for terms with length > 5
        max_expansions : int, optional
            The maximum number of terms to consider for fuzzy matching.
            Defaults to 50.
        """
        super().__init__(
            query=query,
            column=column,
            boost=boost,
            fuzziness=fuzziness,
            max_expansions=max_expansions,
        )

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MATCH``."""
        return FullTextQueryType.MATCH

    def to_dict(self) -> dict:
        """Serialize as ``{"match": {column: {...parameters...}}}``."""
        return {
            "match": {
                self.column: {
                    "query": self.query,
                    "boost": self.boost,
                    "fuzziness": self.fuzziness,
                    "max_expansions": self.max_expansions,
                }
            }
        }
class PhraseQuery(FullTextQuery):
    """Phrase query for full-text search against a single column."""

    # Declared as pydantic fields: the base model rejects assignment to
    # attributes that are not declared fields.
    query: str
    column: str

    def __init__(self, query: str, column: str):
        """
        Phrase query for full-text search.

        Parameters
        ----------
        query : str
            The query string to match against.
        column : str
            The name of the column to match against.
        """
        super().__init__(query=query, column=column)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MATCH_PHRASE``."""
        return FullTextQueryType.MATCH_PHRASE

    def to_dict(self) -> dict:
        """Serialize as ``{"match_phrase": {column: query}}``."""
        return {
            "match_phrase": {
                self.column: self.query,
            }
        }
class BoostQuery(FullTextQuery):
    """Boost query combining a positive and a negative full-text query."""

    # Declared as pydantic fields: the base model rejects assignment to
    # attributes that are not declared fields.
    positive: FullTextQuery
    negative: FullTextQuery
    negative_boost: float

    def __init__(
        self,
        positive: FullTextQuery,
        negative: FullTextQuery,
        negative_boost: float,
    ):
        """
        Boost query for full-text search.

        Parameters
        ----------
        positive : FullTextQuery
            The positive query object.
        negative : FullTextQuery
            The negative query object.
        negative_boost : float
            The boost factor for the negative query.
        """
        super().__init__(
            positive=positive,
            negative=negative,
            negative_boost=negative_boost,
        )

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.BOOST``."""
        return FullTextQueryType.BOOST

    def to_dict(self) -> dict:
        """Serialize as ``{"boost": {...}}`` with both sub-queries serialized recursively."""
        return {
            "boost": {
                "positive": self.positive.to_dict(),
                "negative": self.negative.to_dict(),
                "negative_boost": self.negative_boost,
            }
        }
class MultiMatchQuery(FullTextQuery):
    """Multi-match query: one query string matched against several columns."""

    # Declared as pydantic fields: the base model rejects assignment to
    # attributes that are not declared fields.
    query: str
    columns: List[str]
    boosts: List[float]

    def __init__(
        self,
        query: str,
        columns: list[str],
        *,
        boosts: Optional[list[float]] = None,
    ):
        """
        Multi-match query for full-text search.

        Parameters
        ----------
        query : str
            The query string to match against.
        columns : list[str]
            The list of columns to match against.
        boosts : list[float], optional
            The list of boost factors for each column. If not provided,
            all columns will have the same boost factor.
        """
        if boosts is None:
            # Default: every column gets the neutral boost of 1.0.
            boosts = [1.0] * len(columns)
        super().__init__(query=query, columns=columns, boosts=boosts)

    def query_type(self) -> FullTextQueryType:
        """Return ``FullTextQueryType.MULTI_MATCH``."""
        return FullTextQueryType.MULTI_MATCH

    def to_dict(self) -> dict:
        """Serialize as ``{"multi_match": {"query", "columns", "boost"}}``."""
        return {
            "multi_match": {
                "query": self.query,
                "columns": self.columns,
                # Note: the serialized key is singular ("boost").
                "boost": self.boosts,
            }
        }
class FullTextSearchQuery(pydantic.BaseModel):
"""A LanceDB Full Text Search Query
@@ -92,18 +284,13 @@ class FullTextSearchQuery(pydantic.BaseModel):
The columns to search
If None, then the table should select the column automatically.
query: str
The query to search for
limit: Optional[int] = None
The limit on the number of results to return
wand_factor: Optional[float] = None
The wand factor to use for the search
query: str | FullTextQuery
If a string, it is treated as a MatchQuery.
If a FullTextQuery object, it is used directly.
"""
columns: Optional[List[str]] = None
query: str
limit: Optional[int] = None
wand_factor: Optional[float] = None
query: Union[str, FullTextQuery]
class Query(pydantic.BaseModel):
@@ -712,13 +899,14 @@ class LanceQueryBuilder(ABC):
"""
raise NotImplementedError
def text(self, text: str) -> Self:
def text(self, text: str | FullTextQuery) -> Self:
"""Set the text to search for.
Parameters
----------
text: str
The text to search for.
text: str | FullTextQuery
If a string, it is treated as a MatchQuery.
If a FullTextQuery object, it is used directly.
Returns
-------
@@ -1084,7 +1272,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
def __init__(
self,
table: "Table",
query: str,
query: str | FullTextQuery,
ordering_field_name: Optional[str] = None,
fts_columns: Optional[Union[str, List[str]]] = None,
):
@@ -1691,7 +1879,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._vector = vector
return self
def text(self, text: str) -> LanceHybridQueryBuilder:
def text(self, text: str | FullTextQuery) -> LanceHybridQueryBuilder:
self._text = text
return self
@@ -2088,7 +2276,7 @@ class AsyncQuery(AsyncQueryBase):
)
def nearest_to_text(
self, query: str, columns: Union[str, List[str], None] = None
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
) -> AsyncFTSQuery:
"""
Find the documents that are most relevant to the given text query.
@@ -2114,9 +2302,13 @@ class AsyncQuery(AsyncQueryBase):
columns = [columns]
if columns is None:
columns = []
return AsyncFTSQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
if isinstance(query, str):
return AsyncFTSQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncFTSQuery(self._inner.nearest_to_text(query.to_dict()))
class AsyncFTSQuery(AsyncQueryBase):
@@ -2399,7 +2591,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
return self
def nearest_to_text(
self, query: str, columns: Union[str, List[str], None] = None
self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
) -> AsyncHybridQuery:
"""
Find the documents that are most relevant to the given text query,
@@ -2429,9 +2621,13 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
columns = [columns]
if columns is None:
columns = []
return AsyncHybridQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
if isinstance(query, str):
return AsyncHybridQuery(
self._inner.nearest_to_text({"query": query, "columns": columns})
)
# FullTextQuery object
return AsyncHybridQuery(self._inner.nearest_to_text(query.to_dict()))
async def to_batches(
self, *, max_batch_length: Optional[int] = None

View File

@@ -3373,8 +3373,6 @@ class AsyncTable:
async_query = async_query.nearest_to_text(
query.full_text_query.query, query.full_text_query.columns
)
if query.full_text_query.limit is not None:
async_query = async_query.limit(query.full_text_query.limit)
return async_query

View File

@@ -444,6 +444,16 @@ def test_query_sync_fts():
"prefilter": True,
"with_row_id": True,
"version": None,
} or body == {
"full_text_query": {
"query": "puppy",
"columns": ["description", "name"],
},
"k": 42,
"vector": [],
"prefilter": True,
"with_row_id": True,
"version": None,
}
return pa.table({"id": [1, 2, 3]})

View File

@@ -8,19 +8,19 @@ use arrow::array::Array;
use arrow::array::ArrayData;
use arrow::pyarrow::FromPyArrow;
use arrow::pyarrow::IntoPyArrow;
use lancedb::index::scalar::FullTextSearchQuery;
use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
use lancedb::query::QueryExecutionOptions;
use lancedb::query::QueryFilter;
use lancedb::query::{
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
};
use lancedb::table::AnyQuery;
use pyo3::exceptions::PyNotImplementedError;
use pyo3::exceptions::PyRuntimeError;
use pyo3::exceptions::{PyNotImplementedError, PyValueError};
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
use pyo3::pymethods;
use pyo3::types::PyDict;
use pyo3::types::PyList;
use pyo3::types::{PyDict, PyString};
use pyo3::Bound;
use pyo3::IntoPyObject;
use pyo3::PyAny;
@@ -31,7 +31,7 @@ use pyo3_async_runtimes::tokio::future_into_py;
use crate::arrow::RecordBatchStream;
use crate::error::PythonErrorExt;
use crate::util::parse_distance_type;
use crate::util::{parse_distance_type, parse_fts_query};
// Python representation of full text search parameters
#[derive(Clone)]
@@ -46,8 +46,8 @@ pub struct PyFullTextSearchQuery {
impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
fn from(query: FullTextSearchQuery) -> Self {
PyFullTextSearchQuery {
columns: query.columns,
query: query.query,
columns: query.columns().into_iter().collect(),
query: query.query.query().to_owned(),
limit: query.limit,
wand_factor: query.wand_factor,
}
@@ -236,22 +236,61 @@ impl Query {
}
pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
let query_text = query
let fts_query = query
.get_item("query")?
.ok_or(PyErr::new::<PyRuntimeError, _>(
"Query text is required for nearest_to_text",
))?
.extract::<String>()?;
let columns = query
.get_item("columns")?
.map(|columns| columns.extract::<Vec<String>>())
.transpose()?;
))?;
let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
let query = if let Ok(query_text) = fts_query.downcast::<PyString>() {
let mut query_text = query_text.to_string();
let columns = query
.get_item("columns")?
.map(|columns| columns.extract::<Vec<String>>())
.transpose()?;
let is_phrase =
query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');
let is_multi_match = columns.as_ref().map(|cols| cols.len() > 1).unwrap_or(false);
if is_phrase {
// Remove the surrounding quotes for phrase queries
query_text = query_text[1..query_text.len() - 1].to_string();
}
let query: FtsQuery = match (is_phrase, is_multi_match) {
(false, _) => MatchQuery::new(query_text).into(),
(true, false) => PhraseQuery::new(query_text).into(),
(true, true) => {
return Err(PyValueError::new_err(
"Phrase queries cannot be used with multiple columns.",
));
}
};
let mut query = FullTextSearchQuery::new_query(query);
if let Some(cols) = columns {
if !cols.is_empty() {
query = query.with_columns(&cols).map_err(|e| {
PyValueError::new_err(format!(
"Failed to set full text search columns: {}",
e
))
})?;
}
}
query
} else if let Ok(query) = query.downcast::<PyDict>() {
let query = parse_fts_query(query)?;
FullTextSearchQuery::new_query(query)
} else {
return Err(PyValueError::new_err(
"query must be a string or a Query object",
));
};
Ok(FTSQuery {
fts_query,
inner: self.inner.clone(),
fts_query: query,
})
}
@@ -386,7 +425,7 @@ impl FTSQuery {
}
pub fn get_query(&self) -> String {
self.fts_query.query.clone()
self.fts_query.query.query().to_owned()
}
pub fn to_query_request(&self) -> PyQueryRequest {

View File

@@ -3,11 +3,15 @@
use std::sync::Mutex;
use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
use lancedb::DistanceType;
use pyo3::prelude::{PyAnyMethods, PyDictMethods, PyListMethods};
use pyo3::types::PyDict;
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyfunction, PyResult,
};
use pyo3::{Bound, PyAny};
/// A wrapper around a rust builder
///
@@ -59,3 +63,116 @@ pub fn validate_table_name(table_name: &str) -> PyResult<()> {
lancedb::utils::validate_table_name(table_name)
.map_err(|e| PyValueError::new_err(e.to_string()))
}
/// Parses a Python dict describing a structured full-text search query into an [`FtsQuery`].
///
/// The first key of `query` selects the query type, and its value must itself be a dict.
/// The shapes handled are:
///
/// * `{"match": {<column>: {"query", "boost", "fuzziness", "max_expansions"}}}`
/// * `{"match_phrase": {<column>: <query text>}}`
/// * `{"boost": {"positive": <query dict>, "negative": <query dict>, "negative_boost"}}`
///   (the nested query dicts are parsed recursively)
/// * `{"multi_match": {"query", "columns", "boost"}}`
///
/// # Errors
///
/// Returns a `PyValueError` when a required key is missing, when the query type is not one
/// of the four above, or when the multi-match query cannot be constructed. Errors raised
/// while indexing the dict keys, downcasting values to dicts, or extracting typed values are
/// propagated unchanged.
pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
    let query_type = query.keys().get_item(0)?.extract::<String>()?;
    // Error values are built lazily (`ok_or_else`) so the success path does not pay for
    // formatting and allocating messages that are never used.
    let query_value = query
        .get_item(&query_type)?
        .ok_or_else(|| PyValueError::new_err(format!("Query type {} not found", query_type)))?;
    let query_value = query_value.downcast::<PyDict>()?;

    match query_type.as_str() {
        "match" => {
            // The single key of the inner dict is the column name; its value holds the
            // match parameters.
            let column = query_value.keys().get_item(0)?.extract::<String>()?;
            let params = query_value
                .get_item(&column)?
                .ok_or_else(|| PyValueError::new_err(format!("column {} not found", column)))?;
            let params = params.downcast::<PyDict>()?;

            let query = params
                .get_item("query")?
                .ok_or_else(|| PyValueError::new_err("query not found"))?
                .extract::<String>()?;
            let boost = params
                .get_item("boost")?
                .ok_or_else(|| PyValueError::new_err("boost not found"))?
                .extract::<f32>()?;
            // The key must be present, but its value may be Python `None`.
            let fuzziness = params
                .get_item("fuzziness")?
                .ok_or_else(|| PyValueError::new_err("fuzziness not found"))?
                .extract::<Option<u32>>()?;
            let max_expansions = params
                .get_item("max_expansions")?
                .ok_or_else(|| PyValueError::new_err("max_expansions not found"))?
                .extract::<usize>()?;

            let query = MatchQuery::new(query)
                .with_column(Some(column))
                .with_boost(boost)
                .with_fuzziness(fuzziness)
                .with_max_expansions(max_expansions);
            Ok(query.into())
        }

        "match_phrase" => {
            // The inner dict maps the column name directly to the phrase text.
            let column = query_value.keys().get_item(0)?.extract::<String>()?;
            let query = query_value
                .get_item(&column)?
                .ok_or_else(|| PyValueError::new_err(format!("column {} not found", column)))?
                .extract::<String>()?;

            let query = PhraseQuery::new(query).with_column(Some(column));
            Ok(query.into())
        }

        "boost" => {
            let positive: Bound<'_, PyAny> = query_value
                .get_item("positive")?
                .ok_or_else(|| PyValueError::new_err("positive not found"))?;
            let positive = positive.downcast::<PyDict>()?;

            let negative = query_value
                .get_item("negative")?
                .ok_or_else(|| PyValueError::new_err("negative not found"))?;
            let negative = negative.downcast::<PyDict>()?;

            let negative_boost = query_value
                .get_item("negative_boost")?
                .ok_or_else(|| PyValueError::new_err("negative_boost not found"))?
                .extract::<f32>()?;

            // Both sub-queries are themselves structured queries.
            let positive_query = parse_fts_query(positive)?;
            let negative_query = parse_fts_query(negative)?;
            let query = BoostQuery::new(positive_query, negative_query, Some(negative_boost));
            Ok(query.into())
        }

        "multi_match" => {
            let query = query_value
                .get_item("query")?
                .ok_or_else(|| PyValueError::new_err("query not found"))?
                .extract::<String>()?;
            let columns = query_value
                .get_item("columns")?
                .ok_or_else(|| PyValueError::new_err("columns not found"))?
                .extract::<Vec<String>>()?;
            let boost = query_value
                .get_item("boost")?
                .ok_or_else(|| PyValueError::new_err("boost not found"))?
                .extract::<Vec<f32>>()?;

            let query =
                MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
                    PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
                })?;
            Ok(query.into())
        }

        _ => Err(PyValueError::new_err(format!(
            "Unsupported query type: {}",
            query_type
        ))),
    }
}