feat: update to lance 0.25.3b1 (#2294)

## Summary by CodeRabbit

- **Chores**
  - Updated dependency versions for improved performance and compatibility.
- **New Features**
  - Added support for structured full-text search with expanded query types (e.g., match, phrase, boost, multi-match) and flexible input formats.
  - Introduced a new method to check server support for structural full-text search features.
  - Enhanced the query system with new classes and interfaces for handling various full-text queries.
  - Expanded the functionality of existing methods to accept more complex query structures, including updates to method signatures.
- **Bug Fixes**
  - Improved error handling and reporting for full-text search queries.
- **Refactor**
  - Enhanced query processing with streamlined input handling and improved error reporting, ensuring more robust and consistent search results across platforms.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: BubbleCal <bubble-cal@outlook.com>
2026-01-10 05:42:58 +00:00 · 2025-04-01 06:36:42 -07:00
parent e59f9382a0
commit 625bab3f21
25 changed files with 1442 additions and 183 deletions
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -4,7 +4,9 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+import abc
 from concurrent.futures import ThreadPoolExecutor
+from enum import Enum
 from typing import (
    TYPE_CHECKING,
    Dict,
@@ -83,6 +85,196 @@ def ensure_vector_query(
        return val


+class FullTextQueryType(Enum):
+    """Kinds of structured full-text search query.
+
+    Each member's value is the string that the corresponding query class
+    uses as the single top-level key of the dictionary returned by its
+    ``to_dict`` method.
+    """
+
+    MATCH = "match"
+    MATCH_PHRASE = "match_phrase"
+    BOOST = "boost"
+    MULTI_MATCH = "multi_match"
+
+
+class FullTextQuery(abc.ABC, pydantic.BaseModel):
+    """Abstract base class for structured full-text search queries.
+
+    Concrete subclasses report their kind through ``query_type`` and
+    serialize themselves to a plain dictionary through ``to_dict``.
+    """
+
+    @abc.abstractmethod
+    def query_type(self) -> FullTextQueryType:
+        """
+        Get the query type of the query.
+
+        Returns
+        -------
+        FullTextQueryType
+            The type of the query.
+        """
+
+    @abc.abstractmethod
+    def to_dict(self) -> dict:
+        """
+        Convert the query to a dictionary.
+
+        Returns
+        -------
+        dict
+            The query as a dictionary.
+        """
+
+
+class MatchQuery(FullTextQuery):
+    """Match query for full-text search against a single column.
+
+    ``to_dict`` serializes it as
+    ``{"match": {column: {"query": ..., "boost": ..., "fuzziness": ...,
+    "max_expansions": ...}}}``.
+    """
+
+    # ``FullTextQuery`` derives from ``pydantic.BaseModel``, which only accepts
+    # assignment to declared fields. Every attribute is therefore declared
+    # here and populated through ``BaseModel.__init__`` instead of being set
+    # directly on ``self``.
+    query: str
+    column: str
+    boost: float = 1.0
+    fuzziness: Optional[int] = 0
+    max_expansions: int = 50
+
+    def __init__(
+        self,
+        query: str,
+        column: str,
+        *,
+        boost: float = 1.0,
+        fuzziness: Optional[int] = 0,
+        max_expansions: int = 50,
+    ):
+        """
+        Match query for full-text search.
+
+        Parameters
+        ----------
+        query : str
+            The query string to match against.
+        column : str
+            The name of the column to match against.
+        boost : float, default 1.0
+            The boost factor for the query.
+            The score of each matching document is multiplied by this value.
+        fuzziness : int, optional
+            The maximum edit distance for each term in the match query.
+            Defaults to 0 (exact match).
+            If None, fuzziness is applied automatically by the rules:
+                - 0 for terms with length <= 2
+                - 1 for terms with length <= 5
+                - 2 for terms with length > 5
+        max_expansions : int, optional
+            The maximum number of terms to consider for fuzzy matching.
+            Defaults to 50.
+        """
+        # Keep the positional (query, column) call style while letting
+        # pydantic initialise and validate the model.
+        super().__init__(
+            query=query,
+            column=column,
+            boost=boost,
+            fuzziness=fuzziness,
+            max_expansions=max_expansions,
+        )
+
+    def query_type(self) -> FullTextQueryType:
+        """Return ``FullTextQueryType.MATCH``."""
+        return FullTextQueryType.MATCH
+
+    def to_dict(self) -> dict:
+        """Return the query as ``{"match": {column: {...parameters...}}}``."""
+        return {
+            "match": {
+                self.column: {
+                    "query": self.query,
+                    "boost": self.boost,
+                    "fuzziness": self.fuzziness,
+                    "max_expansions": self.max_expansions,
+                }
+            }
+        }
+
+
+class PhraseQuery(FullTextQuery):
+    """Phrase query for full-text search against a single column.
+
+    ``to_dict`` serializes it as ``{"match_phrase": {column: query}}``.
+    """
+
+    # Declared as pydantic fields because the ``pydantic.BaseModel`` base only
+    # accepts assignment to declared fields.
+    query: str
+    column: str
+
+    def __init__(self, query: str, column: str):
+        """
+        Phrase query for full-text search.
+
+        Parameters
+        ----------
+        query : str
+            The query string to match against.
+        column : str
+            The name of the column to match against.
+        """
+        # Keep the positional call style while letting pydantic initialise
+        # the model.
+        super().__init__(query=query, column=column)
+
+    def query_type(self) -> FullTextQueryType:
+        """Return ``FullTextQueryType.MATCH_PHRASE``."""
+        return FullTextQueryType.MATCH_PHRASE
+
+    def to_dict(self) -> dict:
+        """Return the query as ``{"match_phrase": {column: query}}``."""
+        return {
+            "match_phrase": {
+                self.column: self.query,
+            }
+        }
+
+
+class BoostQuery(FullTextQuery):
+    """Boost query combining a positive and a negative full-text query.
+
+    ``to_dict`` serializes it as ``{"boost": {"positive": ..., "negative": ...,
+    "negative_boost": ...}}``, where the two sub-queries are serialized with
+    their own ``to_dict``.
+    """
+
+    # Declared as pydantic fields because the ``pydantic.BaseModel`` base only
+    # accepts assignment to declared fields.
+    positive: FullTextQuery
+    negative: FullTextQuery
+    negative_boost: float
+
+    def __init__(
+        self,
+        positive: FullTextQuery,
+        negative: FullTextQuery,
+        negative_boost: float,
+    ):
+        """
+        Boost query for full-text search.
+
+        Parameters
+        ----------
+        positive : FullTextQuery
+            The positive query object.
+        negative : FullTextQuery
+            The negative query object.
+        negative_boost : float
+            The boost factor for the negative query.
+        """
+        # Keep the positional call style while letting pydantic initialise
+        # the model.
+        super().__init__(
+            positive=positive,
+            negative=negative,
+            negative_boost=negative_boost,
+        )
+
+    def query_type(self) -> FullTextQueryType:
+        """Return ``FullTextQueryType.BOOST``."""
+        return FullTextQueryType.BOOST
+
+    def to_dict(self) -> dict:
+        """Return the query as a ``{"boost": {...}}`` dictionary."""
+        return {
+            "boost": {
+                "positive": self.positive.to_dict(),
+                "negative": self.negative.to_dict(),
+                "negative_boost": self.negative_boost,
+            }
+        }
+
+
+class MultiMatchQuery(FullTextQuery):
+    """Multi-match query: one query string matched against several columns.
+
+    ``to_dict`` serializes it as ``{"multi_match": {"query": ...,
+    "columns": [...], "boost": [...]}}``; note that the per-column boosts are
+    emitted under the key ``"boost"``.
+    """
+
+    # Declared as pydantic fields because the ``pydantic.BaseModel`` base only
+    # accepts assignment to declared fields.
+    query: str
+    columns: List[str]
+    boosts: List[float]
+
+    def __init__(
+        self,
+        query: str,
+        columns: list[str],
+        *,
+        boosts: Optional[list[float]] = None,
+    ):
+        """
+        Multi-match query for full-text search.
+
+        Parameters
+        ----------
+        query : str
+            The query string to match against.
+
+        columns : list[str]
+            The list of columns to match against.
+
+        boosts : list[float], optional
+            The list of boost factors for each column. If not provided,
+            all columns will have the same boost factor.
+        """
+        if boosts is None:
+            # No explicit boosts: weight every column equally.
+            boosts = [1.0] * len(columns)
+        # Keep the positional call style while letting pydantic initialise
+        # the model.
+        super().__init__(query=query, columns=columns, boosts=boosts)
+
+    def query_type(self) -> FullTextQueryType:
+        """Return ``FullTextQueryType.MULTI_MATCH``."""
+        return FullTextQueryType.MULTI_MATCH
+
+    def to_dict(self) -> dict:
+        """Return the query as a ``{"multi_match": {...}}`` dictionary."""
+        return {
+            "multi_match": {
+                "query": self.query,
+                "columns": self.columns,
+                "boost": self.boosts,
+            }
+        }
+
+
 class FullTextSearchQuery(pydantic.BaseModel):
    """A LanceDB Full Text Search Query

@@ -92,18 +284,13 @@ class FullTextSearchQuery(pydantic.BaseModel):
        The columns to search

        If None, then the table should select the column automatically.
-    query: str
-        The query to search for
-    limit: Optional[int] = None
-        The limit on the number of results to return
-    wand_factor: Optional[float] = None
-        The wand factor to use for the search
+    query: str | FullTextQuery
+        If a string, it is treated as a MatchQuery.
+        If a FullTextQuery object, it is used directly.
    """

    columns: Optional[List[str]] = None
-    query: str
-    limit: Optional[int] = None
-    wand_factor: Optional[float] = None
+    query: Union[str, FullTextQuery]


 class Query(pydantic.BaseModel):
@@ -712,13 +899,14 @@ class LanceQueryBuilder(ABC):
        """
        raise NotImplementedError

-    def text(self, text: str) -> Self:
+    def text(self, text: str | FullTextQuery) -> Self:
        """Set the text to search for.

        Parameters
        ----------
-        text: str
-            The text to search for.
+        text: str | FullTextQuery
+            If a string, it is treated as a MatchQuery.
+            If a FullTextQuery object, it is used directly.

        Returns
        -------
@@ -1084,7 +1272,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
    def __init__(
        self,
        table: "Table",
-        query: str,
+        query: str | FullTextQuery,
        ordering_field_name: Optional[str] = None,
        fts_columns: Optional[Union[str, List[str]]] = None,
    ):
@@ -1691,7 +1879,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
        self._vector = vector
        return self

-    def text(self, text: str) -> LanceHybridQueryBuilder:
+    def text(self, text: str | FullTextQuery) -> LanceHybridQueryBuilder:
+        """Set the text to search for.
+
+        Parameters
+        ----------
+        text: str | FullTextQuery
+            The query text or structured query; it is stored unchanged on
+            the builder.
+
+        Returns
+        -------
+        LanceHybridQueryBuilder
+            This builder, so that calls can be chained.
+        """
         self._text = text
         return self

@@ -2088,7 +2276,7 @@ class AsyncQuery(AsyncQueryBase):
            )

    def nearest_to_text(
-        self, query: str, columns: Union[str, List[str], None] = None
+        self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
    ) -> AsyncFTSQuery:
        """
        Find the documents that are most relevant to the given text query.
@@ -2114,9 +2302,13 @@ class AsyncQuery(AsyncQueryBase):
            columns = [columns]
        if columns is None:
            columns = []
-        return AsyncFTSQuery(
-            self._inner.nearest_to_text({"query": query, "columns": columns})
-        )
+
+        if isinstance(query, str):
+            return AsyncFTSQuery(
+                self._inner.nearest_to_text({"query": query, "columns": columns})
+            )
+        # FullTextQuery object
+        return AsyncFTSQuery(self._inner.nearest_to_text(query.to_dict()))


 class AsyncFTSQuery(AsyncQueryBase):
@@ -2399,7 +2591,7 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
        return self

    def nearest_to_text(
-        self, query: str, columns: Union[str, List[str], None] = None
+        self, query: str | FullTextQuery, columns: Union[str, List[str], None] = None
    ) -> AsyncHybridQuery:
        """
        Find the documents that are most relevant to the given text query,
@@ -2429,9 +2621,13 @@ class AsyncVectorQuery(AsyncQueryBase, AsyncVectorQueryBase):
            columns = [columns]
        if columns is None:
            columns = []
-        return AsyncHybridQuery(
-            self._inner.nearest_to_text({"query": query, "columns": columns})
-        )
+
+        if isinstance(query, str):
+            return AsyncHybridQuery(
+                self._inner.nearest_to_text({"query": query, "columns": columns})
+            )
+        # FullTextQuery object
+        return AsyncHybridQuery(self._inner.nearest_to_text(query.to_dict()))

    async def to_batches(
        self, *, max_batch_length: Optional[int] = None
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -3373,8 +3373,6 @@ class AsyncTable:
            async_query = async_query.nearest_to_text(
                query.full_text_query.query, query.full_text_query.columns
            )
-            if query.full_text_query.limit is not None:
-                async_query = async_query.limit(query.full_text_query.limit)

        return async_query

--- a/python/python/tests/test_remote_db.py
+++ b/python/python/tests/test_remote_db.py
@@ -444,6 +444,16 @@ def test_query_sync_fts():
            "prefilter": True,
            "with_row_id": True,
            "version": None,
+        } or body == {
+            "full_text_query": {
+                "query": "puppy",
+                "columns": ["description", "name"],
+            },
+            "k": 42,
+            "vector": [],
+            "prefilter": True,
+            "with_row_id": True,
+            "version": None,
        }

        return pa.table({"id": [1, 2, 3]})
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -8,19 +8,19 @@ use arrow::array::Array;
 use arrow::array::ArrayData;
 use arrow::pyarrow::FromPyArrow;
 use arrow::pyarrow::IntoPyArrow;
-use lancedb::index::scalar::FullTextSearchQuery;
+use lancedb::index::scalar::{FtsQuery, FullTextSearchQuery, MatchQuery, PhraseQuery};
 use lancedb::query::QueryExecutionOptions;
 use lancedb::query::QueryFilter;
 use lancedb::query::{
    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
 };
 use lancedb::table::AnyQuery;
-use pyo3::exceptions::PyNotImplementedError;
 use pyo3::exceptions::PyRuntimeError;
+use pyo3::exceptions::{PyNotImplementedError, PyValueError};
 use pyo3::prelude::{PyAnyMethods, PyDictMethods};
 use pyo3::pymethods;
-use pyo3::types::PyDict;
 use pyo3::types::PyList;
+use pyo3::types::{PyDict, PyString};
 use pyo3::Bound;
 use pyo3::IntoPyObject;
 use pyo3::PyAny;
@@ -31,7 +31,7 @@ use pyo3_async_runtimes::tokio::future_into_py;

 use crate::arrow::RecordBatchStream;
 use crate::error::PythonErrorExt;
-use crate::util::parse_distance_type;
+use crate::util::{parse_distance_type, parse_fts_query};

 // Python representation of full text search parameters
 #[derive(Clone)]
@@ -46,8 +46,8 @@ pub struct PyFullTextSearchQuery {
 impl From<FullTextSearchQuery> for PyFullTextSearchQuery {
    fn from(query: FullTextSearchQuery) -> Self {
        PyFullTextSearchQuery {
-            columns: query.columns,
-            query: query.query,
+            columns: query.columns().into_iter().collect(),
+            query: query.query.query().to_owned(),
            limit: query.limit,
            wand_factor: query.wand_factor,
        }
@@ -236,22 +236,61 @@ impl Query {
    }

    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
-        let query_text = query
+        let fts_query = query
            .get_item("query")?
            .ok_or(PyErr::new::<PyRuntimeError, _>(
                "Query text is required for nearest_to_text",
-            ))?
-            .extract::<String>()?;
-        let columns = query
-            .get_item("columns")?
-            .map(|columns| columns.extract::<Vec<String>>())
-            .transpose()?;
+            ))?;

-        let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
+        let query = if let Ok(query_text) = fts_query.downcast::<PyString>() {
+            let mut query_text = query_text.to_string();
+            let columns = query
+                .get_item("columns")?
+                .map(|columns| columns.extract::<Vec<String>>())
+                .transpose()?;
+
+            let is_phrase =
+                query_text.len() >= 2 && query_text.starts_with('"') && query_text.ends_with('"');
+            let is_multi_match = columns.as_ref().map(|cols| cols.len() > 1).unwrap_or(false);
+
+            if is_phrase {
+                // Remove the surrounding quotes for phrase queries
+                query_text = query_text[1..query_text.len() - 1].to_string();
+            }
+
+            let query: FtsQuery = match (is_phrase, is_multi_match) {
+                (false, _) => MatchQuery::new(query_text).into(),
+                (true, false) => PhraseQuery::new(query_text).into(),
+                (true, true) => {
+                    return Err(PyValueError::new_err(
+                        "Phrase queries cannot be used with multiple columns.",
+                    ));
+                }
+            };
+            let mut query = FullTextSearchQuery::new_query(query);
+            if let Some(cols) = columns {
+                if !cols.is_empty() {
+                    query = query.with_columns(&cols).map_err(|e| {
+                        PyValueError::new_err(format!(
+                            "Failed to set full text search columns: {}",
+                            e
+                        ))
+                    })?;
+                }
+            }
+            query
+        } else if let Ok(query) = query.downcast::<PyDict>() {
+            let query = parse_fts_query(query)?;
+            FullTextSearchQuery::new_query(query)
+        } else {
+            return Err(PyValueError::new_err(
+                "query must be a string or a Query object",
+            ));
+        };

        Ok(FTSQuery {
-            fts_query,
            inner: self.inner.clone(),
+            fts_query: query,
        })
    }

@@ -386,7 +425,7 @@ impl FTSQuery {
    }

     pub fn get_query(&self) -> String {
+        // Returns an owned `String` copied from whatever `query()` yields for the
+        // wrapped full-text query. NOTE(review): presumably that is the raw search
+        // text of the query; confirm against `FtsQuery::query` in lance.
-        self.fts_query.query.clone()
+        self.fts_query.query.query().to_owned()
     }

    pub fn to_query_request(&self) -> PyQueryRequest {
--- a/python/src/util.rs
+++ b/python/src/util.rs
@@ -3,11 +3,15 @@

 use std::sync::Mutex;

+use lancedb::index::scalar::{BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, PhraseQuery};
 use lancedb::DistanceType;
+use pyo3::prelude::{PyAnyMethods, PyDictMethods, PyListMethods};
+use pyo3::types::PyDict;
 use pyo3::{
    exceptions::{PyRuntimeError, PyValueError},
    pyfunction, PyResult,
 };
+use pyo3::{Bound, PyAny};

 /// A wrapper around a rust builder
 ///
@@ -59,3 +63,116 @@ pub fn validate_table_name(table_name: &str) -> PyResult<()> {
    lancedb::utils::validate_table_name(table_name)
        .map_err(|e| PyValueError::new_err(e.to_string()))
 }
+
+/// Converts the dictionary form of a structured full-text query into an [`FtsQuery`].
+///
+/// The first key of `query` names the query type; its value must itself be a
+/// dictionary holding that type's parameters:
+///
+/// * `"match"`: `{column: {"query", "boost", "fuzziness", "max_expansions"}}`
+/// * `"match_phrase"`: `{column: query}`
+/// * `"boost"`: `{"positive": <query dict>, "negative": <query dict>, "negative_boost"}`,
+///   where both sub-queries are parsed recursively
+/// * `"multi_match"`: `{"query", "columns", "boost"}`
+///
+/// Only the first key of `query` (and, for `"match"` / `"match_phrase"`, the first
+/// key of the inner dictionary) is looked at.
+///
+/// # Errors
+///
+/// Returns a `ValueError` when a required entry is missing, when the query type
+/// is not one of the above, or when the multi-match query cannot be constructed.
+/// Errors raised while indexing, downcasting or extracting values are propagated.
+pub fn parse_fts_query(query: &Bound<'_, PyDict>) -> PyResult<FtsQuery> {
+    let query_type = query.keys().get_item(0)?.extract::<String>()?;
+    let query_value = query.get_item(&query_type)?.ok_or_else(|| {
+        PyValueError::new_err(format!("Query type {} not found", query_type))
+    })?;
+    let query_value = query_value.downcast::<PyDict>()?;
+
+    match query_type.as_str() {
+        "match" => {
+            let column = query_value.keys().get_item(0)?.extract::<String>()?;
+            let params = query_value
+                .get_item(&column)?
+                .ok_or_else(|| PyValueError::new_err(format!("column {} not found", column)))?;
+            let params = params.downcast::<PyDict>()?;
+
+            let query = required_item(params, "query")?.extract::<String>()?;
+            let boost = required_item(params, "boost")?.extract::<f32>()?;
+            // `None` on the Python side is passed through as `None`.
+            let fuzziness = required_item(params, "fuzziness")?.extract::<Option<u32>>()?;
+            let max_expansions = required_item(params, "max_expansions")?.extract::<usize>()?;
+
+            let query = MatchQuery::new(query)
+                .with_column(Some(column))
+                .with_boost(boost)
+                .with_fuzziness(fuzziness)
+                .with_max_expansions(max_expansions);
+            Ok(query.into())
+        }
+
+        "match_phrase" => {
+            let column = query_value.keys().get_item(0)?.extract::<String>()?;
+            let query = query_value
+                .get_item(&column)?
+                .ok_or_else(|| PyValueError::new_err(format!("column {} not found", column)))?
+                .extract::<String>()?;
+
+            let query = PhraseQuery::new(query).with_column(Some(column));
+            Ok(query.into())
+        }
+
+        "boost" => {
+            let positive = required_item(query_value, "positive")?;
+            let positive = positive.downcast::<PyDict>()?;
+
+            let negative = required_item(query_value, "negative")?;
+            let negative = negative.downcast::<PyDict>()?;
+
+            let negative_boost =
+                required_item(query_value, "negative_boost")?.extract::<f32>()?;
+
+            let positive_query = parse_fts_query(positive)?;
+            let negative_query = parse_fts_query(negative)?;
+            let query = BoostQuery::new(positive_query, negative_query, Some(negative_boost));
+
+            Ok(query.into())
+        }
+
+        "multi_match" => {
+            let query = required_item(query_value, "query")?.extract::<String>()?;
+            let columns = required_item(query_value, "columns")?.extract::<Vec<String>>()?;
+            let boost = required_item(query_value, "boost")?.extract::<Vec<f32>>()?;
+
+            let query =
+                MultiMatchQuery::try_new_with_boosts(query, columns, boost).map_err(|e| {
+                    PyValueError::new_err(format!("Error creating MultiMatchQuery: {}", e))
+                })?;
+            Ok(query.into())
+        }
+
+        _ => Err(PyValueError::new_err(format!(
+            "Unsupported query type: {}",
+            query_type
+        ))),
+    }
+}
+
+/// Looks up `key` in `dict`, returning a `ValueError` reading `"<key> not found"`
+/// when the entry is absent. The error is only built on the failure path.
+fn required_item<'py>(dict: &Bound<'py, PyDict>, key: &str) -> PyResult<Bound<'py, PyAny>> {
+    dict.get_item(key)?
+        .ok_or_else(|| PyValueError::new_err(format!("{} not found", key)))
+}