Bump version: 0.25.3-beta.4 → 0.25.3-beta.5

feat: add fts udtf in sql (#2755)
Support FTS feature parity in SQL to match the current Python API capability. Add a `.to_json()` method to the FTS query classes so they can be used with the SQL `fts()` UDTF.

Related: https://github.com/lancedb/blog-lancedb/pull/147

Example:

    query = MatchQuery("puppy", "text", fuzziness=2)
    result = client.execute(f"SELECT * FROM fts('table', '{query.to_json()}')")

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-12-24 22:09:58 +00:00 · 2025-10-31 17:07:31 +00:00 · 2025-10-31 10:06:19 -07:00 · 2025-10-31 01:14:39 +00:00
29 changed files with 2296 additions and 34 deletions
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.22.3-beta.3"
+current_version = "0.22.3-beta.4"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4684,7 +4684,7 @@ dependencies = [

 [[package]]
 name = "lancedb"
-version = "0.22.3-beta.3"
+version = "0.22.3-beta.4"
 dependencies = [
 "ahash",
 "anyhow",
@@ -4781,7 +4781,7 @@ dependencies = [

 [[package]]
 name = "lancedb-nodejs"
-version = "0.22.3-beta.3"
+version = "0.22.3-beta.4"
 dependencies = [
 "arrow-array",
 "arrow-ipc",
@@ -4801,7 +4801,7 @@ dependencies = [

 [[package]]
 name = "lancedb-python"
-version = "0.25.3-beta.3"
+version = "0.25.3-beta.4"
 dependencies = [
 "arrow",
 "async-trait",
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -8,7 +8,7 @@
    <parent>
        <groupId>com.lancedb</groupId>
        <artifactId>lancedb-parent</artifactId>
-        <version>0.22.3-beta.3</version>
+        <version>0.22.3-beta.4</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

--- a/java/lance-namespace/pom.xml
+++ b/java/lance-namespace/pom.xml
@@ -8,7 +8,7 @@
    <parent>
        <groupId>com.lancedb</groupId>
        <artifactId>lancedb-parent</artifactId>
-        <version>0.22.3-beta.3</version>
+        <version>0.22.3-beta.4</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

--- a/java/pom.xml
+++ b/java/pom.xml
@@ -6,7 +6,7 @@

    <groupId>com.lancedb</groupId>
    <artifactId>lancedb-parent</artifactId>
-    <version>0.22.3-beta.3</version>
+    <version>0.22.3-beta.4</version>
    <packaging>pom</packaging>
    <name>${project.artifactId}</name>
    <description>LanceDB Java SDK Parent POM</description>
--- a/nodejs/Cargo.toml
+++ b/nodejs/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.22.3-beta.3"
+version = "0.22.3-beta.4"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
--- a/nodejs/npm/darwin-arm64/package.json
+++ b/nodejs/npm/darwin-arm64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-arm64",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["darwin"],
 	"cpu": ["arm64"],
 	"main": "lancedb.darwin-arm64.node",
--- a/nodejs/npm/darwin-x64/package.json
+++ b/nodejs/npm/darwin-x64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-x64",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["darwin"],
 	"cpu": ["x64"],
 	"main": "lancedb.darwin-x64.node",
--- a/nodejs/npm/linux-arm64-gnu/package.json
+++ b/nodejs/npm/linux-arm64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-gnu",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-gnu.node",
--- a/nodejs/npm/linux-arm64-musl/package.json
+++ b/nodejs/npm/linux-arm64-musl/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-musl",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-musl.node",
--- a/nodejs/npm/linux-x64-gnu/package.json
+++ b/nodejs/npm/linux-x64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-gnu",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-gnu.node",
--- a/nodejs/npm/linux-x64-musl/package.json
+++ b/nodejs/npm/linux-x64-musl/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-musl",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-musl.node",
--- a/nodejs/npm/win32-arm64-msvc/package.json
+++ b/nodejs/npm/win32-arm64-msvc/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.22.3-beta.3",
+  "version": "0.22.3-beta.4",
  "os": [
    "win32"
  ],
--- a/nodejs/npm/win32-x64-msvc/package.json
+++ b/nodejs/npm/win32-x64-msvc/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-win32-x64-msvc",
-	"version": "0.22.3-beta.3",
+	"version": "0.22.3-beta.4",
 	"os": ["win32"],
 	"cpu": ["x64"],
 	"main": "lancedb.win32-x64-msvc.node",
--- a/nodejs/package-lock.json
+++ b/nodejs/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "@lancedb/lancedb",
-  "version": "0.22.3-beta.3",
+  "version": "0.22.3-beta.4",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "@lancedb/lancedb",
-      "version": "0.22.3-beta.3",
+      "version": "0.22.3-beta.4",
      "cpu": [
        "x64",
        "arm64"
--- a/nodejs/package.json
+++ b/nodejs/package.json
@@ -11,7 +11,7 @@
    "ann"
  ],
  "private": false,
-  "version": "0.22.3-beta.3",
+  "version": "0.22.3-beta.4",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.25.3-beta.4"
+current_version = "0.25.3-beta.5"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.25.3-beta.4"
+version = "0.25.3-beta.5"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/python/lancedb/_lancedb.pyi
+++ b/python/python/lancedb/_lancedb.pyi
@@ -339,3 +339,4 @@ class AsyncPermutationBuilder:
 def async_permutation_builder(
    table: Table, dest_table_name: str
 ) -> AsyncPermutationBuilder: ...
+def fts_query_to_json(query: Any) -> str: ...
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -37,7 +37,7 @@ from .rerankers.base import Reranker
 from .rerankers.rrf import RRFReranker
 from .rerankers.util import check_reranker_result
 from .util import flatten_columns
-
+from lancedb._lancedb import fts_query_to_json
 from typing_extensions import Annotated

 if TYPE_CHECKING:
@@ -124,6 +124,24 @@ class FullTextQuery(ABC):
        """
        pass

+    def to_json(self) -> str:
+        """
+        Convert the query to a JSON string.
+
+        Returns
+        -------
+        str
+            A JSON string representation of the query.
+
+        Examples
+        --------
+        >>> from lancedb.query import MatchQuery
+        >>> query = MatchQuery("puppy", "text", fuzziness=2)
+        >>> query.to_json()
+        '{"match":{"column":"text","terms":"puppy","boost":1.0,"fuzziness":2,"max_expansions":50,"operator":"Or","prefix_length":0}}'
+        """
+        return fts_query_to_json(self)
+
    def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
        """
        Combine two queries with a logical AND operation.
@@ -288,6 +306,8 @@ class BooleanQuery(FullTextQuery):
    ----------
    queries : list[tuple(Occur, FullTextQuery)]
        The list of queries with their occurrence requirements.
+        Each tuple contains an Occur value (MUST, SHOULD, or MUST_NOT)
+        and a FullTextQuery to apply.
    """

    queries: list[tuple[Occur, FullTextQuery]]
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -20,7 +20,14 @@ from unittest import mock
 import lancedb as ldb
 from lancedb.db import DBConnection
 from lancedb.index import FTS
-from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery
+from lancedb.query import (
+    BoostQuery,
+    MatchQuery,
+    MultiMatchQuery,
+    PhraseQuery,
+    BooleanQuery,
+    Occur,
+)
 import numpy as np
 import pyarrow as pa
 import pandas as pd
@@ -727,3 +734,146 @@ def test_fts_ngram(mem_db: DBConnection):
    results = table.search("la", query_type="fts").limit(10).to_list()
    assert len(results) == 2
    assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
+
+
+def test_fts_query_to_json():
+    """Test that FTS query to_json() produces valid JSON strings with exact format."""
+
+    # Test MatchQuery - basic
+    match_query = MatchQuery("hello world", "text")
+    json_str = match_query.to_json()
+    expected = (
+        '{"match":{"column":"text","terms":"hello world","boost":1.0,'
+        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}'
+    )
+    assert json_str == expected
+
+    # Test MatchQuery with options
+    match_query = MatchQuery("puppy", "text", fuzziness=2, boost=1.5, prefix_length=3)
+    json_str = match_query.to_json()
+    expected = (
+        '{"match":{"column":"text","terms":"puppy","boost":1.5,"fuzziness":2,'
+        '"max_expansions":50,"operator":"Or","prefix_length":3}}'
+    )
+    assert json_str == expected
+
+    # Test PhraseQuery
+    phrase_query = PhraseQuery("quick brown fox", "title")
+    json_str = phrase_query.to_json()
+    expected = '{"phrase":{"column":"title","terms":"quick brown fox","slop":0}}'
+    assert json_str == expected
+
+    # Test PhraseQuery with slop
+    phrase_query = PhraseQuery("quick brown", "title", slop=2)
+    json_str = phrase_query.to_json()
+    expected = '{"phrase":{"column":"title","terms":"quick brown","slop":2}}'
+    assert json_str == expected
+
+    # Test BooleanQuery with MUST
+    must_query = BooleanQuery(
+        [
+            (Occur.MUST, MatchQuery("puppy", "text")),
+            (Occur.MUST, MatchQuery("runs", "text")),
+        ]
+    )
+    json_str = must_query.to_json()
+    expected = (
+        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
+        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
+        '"prefix_length":0}},{"match":{"column":"text","terms":"runs","boost":1.0,'
+        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}],'
+        '"must_not":[]}}'
+    )
+    assert json_str == expected
+
+    # Test BooleanQuery with SHOULD
+    should_query = BooleanQuery(
+        [
+            (Occur.SHOULD, MatchQuery("cat", "text")),
+            (Occur.SHOULD, MatchQuery("dog", "text")),
+        ]
+    )
+    json_str = should_query.to_json()
+    expected = (
+        '{"boolean":{"should":[{"match":{"column":"text","terms":"cat","boost":1.0,'
+        '"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}},'
+        '{"match":{"column":"text","terms":"dog","boost":1.0,"fuzziness":0,'
+        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must":[],'
+        '"must_not":[]}}'
+    )
+    assert json_str == expected
+
+    # Test BooleanQuery with MUST_NOT
+    must_not_query = BooleanQuery(
+        [
+            (Occur.MUST, MatchQuery("puppy", "text")),
+            (Occur.MUST_NOT, MatchQuery("training", "text")),
+        ]
+    )
+    json_str = must_not_query.to_json()
+    expected = (
+        '{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
+        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
+        '"prefix_length":0}}],"must_not":[{"match":{"column":"text",'
+        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
+        '"operator":"Or","prefix_length":0}}]}}'
+    )
+    assert json_str == expected
+
+    # Test BoostQuery
+    positive = MatchQuery("puppy", "text")
+    negative = MatchQuery("training", "text")
+    boost_query = BoostQuery(positive, negative, negative_boost=0.3)
+    json_str = boost_query.to_json()
+    expected = (
+        '{"boost":{"positive":{"match":{"column":"text","terms":"puppy",'
+        '"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
+        '"prefix_length":0}},"negative":{"match":{"column":"text",'
+        '"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
+        '"operator":"Or","prefix_length":0}},"negative_boost":0.3}}'
+    )
+    assert json_str == expected
+
+    # Test MultiMatchQuery
+    multi_match = MultiMatchQuery("python", ["tags", "title"])
+    json_str = multi_match.to_json()
+    expected = (
+        '{"multi_match":{"query":"python","columns":["tags","title"],'
+        '"boost":[1.0,1.0]}}'
+    )
+    assert json_str == expected
+
+    # Test complex nested BooleanQuery
+    inner1 = BooleanQuery(
+        [
+            (Occur.MUST, MatchQuery("python", "tags")),
+            (Occur.MUST, MatchQuery("tutorial", "title")),
+        ]
+    )
+    inner2 = BooleanQuery(
+        [
+            (Occur.MUST, MatchQuery("rust", "tags")),
+            (Occur.MUST, MatchQuery("guide", "title")),
+        ]
+    )
+    complex_query = BooleanQuery(
+        [
+            (Occur.SHOULD, inner1),
+            (Occur.SHOULD, inner2),
+        ]
+    )
+    json_str = complex_query.to_json()
+    expected = (
+        '{"boolean":{"should":[{"boolean":{"should":[],"must":[{"match":'
+        '{"column":"tags","terms":"python","boost":1.0,"fuzziness":0,'
+        '"max_expansions":50,"operator":"Or","prefix_length":0}},{"match":'
+        '{"column":"title","terms":"tutorial","boost":1.0,"fuzziness":0,'
+        '"max_expansions":50,"operator":"Or","prefix_length":0}}],"must_not":[]}}'
+        ',{"boolean":{"should":[],"must":[{"match":{"column":"tags",'
+        '"terms":"rust","boost":1.0,"fuzziness":0,"max_expansions":50,'
+        '"operator":"Or","prefix_length":0}},{"match":{"column":"title",'
+        '"terms":"guide","boost":1.0,"fuzziness":0,"max_expansions":50,'
+        '"operator":"Or","prefix_length":0}}],"must_not":[]}}],"must":[],'
+        '"must_not":[]}}'
+    )
+    assert json_str == expected
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -55,6 +55,7 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(connect, m)?)?;
    m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
    m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
+    m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;
    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
    Ok(())
 }
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -23,6 +23,7 @@ use lancedb::query::{
 };
 use lancedb::table::AnyQuery;
 use pyo3::prelude::{PyAnyMethods, PyDictMethods};
+use pyo3::pyfunction;
 use pyo3::pymethods;
 use pyo3::types::PyList;
 use pyo3::types::{PyDict, PyString};
@@ -982,3 +983,15 @@ impl HybridQuery {
        req
    }
 }
+
+/// Convert a Python FTS query to JSON string
+#[pyfunction]
+pub fn fts_query_to_json(query_obj: &Bound<'_, PyAny>) -> PyResult<String> {
+    let wrapped: PyLanceDB<FtsQuery> = query_obj.extract()?;
+    lancedb::table::datafusion::udtf::fts::to_json(&wrapped.0).map_err(|e| {
+        PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
+            "Failed to serialize FTS query to JSON: {}",
+            e
+        ))
+    })
+}
--- a/rust/lancedb/Cargo.toml
+++ b/rust/lancedb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.22.3-beta.3"
+version = "0.22.3-beta.4"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
--- a/rust/lancedb/src/query.rs
+++ b/rust/lancedb/src/query.rs
@@ -667,6 +667,12 @@ pub struct QueryRequest {

    /// Configure how query results are normalized when doing hybrid search
    pub norm: Option<NormalizeMethod>,
+
+    /// If set to true, disables automatic projection of scoring columns (_score, _distance).
+    /// When disabled, these columns are only included if explicitly requested in the projection.
+    ///
+    /// By default, this is false (scoring columns are auto-projected for backward compatibility).
+    pub disable_scoring_autoprojection: bool,
 }

 impl Default for QueryRequest {
@@ -682,6 +688,7 @@ impl Default for QueryRequest {
            prefilter: true,
            reranker: None,
            norm: None,
+            disable_scoring_autoprojection: false,
        }
    }
 }
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -2391,6 +2391,10 @@ impl BaseTable for NativeTable {
            scanner.distance_metric(distance_type.into());
        }

+        if query.base.disable_scoring_autoprojection {
+            scanner.disable_scoring_autoprojection();
+        }
+
        Ok(scanner.create_plan().await?)
    }

--- a/rust/lancedb/src/table/datafusion.rs
+++ b/rust/lancedb/src/table/datafusion.rs
@@ -2,6 +2,9 @@
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors

 //! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
+
+pub mod udtf;
+
 use std::{collections::HashMap, sync::Arc};

 use arrow_array::RecordBatch;
@@ -21,6 +24,8 @@ use crate::{
    query::{QueryExecutionOptions, QueryFilter, QueryRequest, Select},
    Result,
 };
+use arrow_schema::{DataType, Field};
+use lance_index::scalar::FullTextSearchQuery;

 /// Datafusion attempts to maintain batch metadata
 ///
@@ -135,19 +140,38 @@ impl ExecutionPlan for MetadataEraserExec {
 pub struct BaseTableAdapter {
    table: Arc<dyn BaseTable>,
    schema: Arc<ArrowSchema>,
+    fts_query: Option<FullTextSearchQuery>,
 }

 impl BaseTableAdapter {
    pub async fn try_new(table: Arc<dyn BaseTable>) -> Result<Self> {
-        let schema = Arc::new(
-            table
-                .schema()
-                .await?
-                .as_ref()
-                .clone()
-                .with_metadata(HashMap::default()),
-        );
-        Ok(Self { table, schema })
+        let schema = table
+            .schema()
+            .await?
+            .as_ref()
+            .clone()
+            .with_metadata(HashMap::default());
+
+        Ok(Self {
+            table,
+            schema: Arc::new(schema),
+            fts_query: None,
+        })
+    }
+
+    /// Create a new adapter with an FTS query applied.
+    pub fn with_fts_query(&self, fts_query: FullTextSearchQuery) -> Self {
+        // Add _score column to the schema
+        let score_field = Field::new("_score", DataType::Float32, true);
+        let mut fields = self.schema.fields().to_vec();
+        fields.push(Arc::new(score_field));
+        let schema = Arc::new(ArrowSchema::new(fields));
+
+        Self {
+            table: self.table.clone(),
+            schema,
+            fts_query: Some(fts_query),
+        }
    }
 }

@@ -172,7 +196,15 @@ impl TableProvider for BaseTableAdapter {
        filters: &[Expr],
        limit: Option<usize>,
    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
-        let mut query = QueryRequest::default();
+        // For FTS queries, disable auto-projection of _score to match DataFusion expectations
+        let disable_scoring = self.fts_query.is_some() && projection.is_some();
+
+        let mut query = QueryRequest {
+            full_text_search: self.fts_query.clone(),
+            disable_scoring_autoprojection: disable_scoring,
+            ..Default::default()
+        };
+
        if let Some(projection) = projection {
            let field_names = projection
                .iter()
--- a/rust/lancedb/src/table/datafusion/udtf.rs
+++ b/rust/lancedb/src/table/datafusion/udtf.rs
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+//! User-Defined Table Functions (UDTFs) for DataFusion integration
+
+pub mod fts;
--- a/rust/lancedb/src/table/datafusion/udtf/fts.rs
+++ b/rust/lancedb/src/table/datafusion/udtf/fts.rs