Compare commits

...

3 Commits

Author SHA1 Message Date
Lance Release
273ba18426 Bump version: 0.25.3-beta.4 → 0.25.3-beta.5 2025-10-31 17:07:31 +00:00
LuQQiu
8b94308cf2 feat: add fts udtf in sql (#2755)
Support FTS feature parity in SQL to match current Python API
capability.
Add `.to_json()` method to FTS query classes to enable usage with SQL
`fts()` UDTF.
Related: https://github.com/lancedb/blog-lancedb/pull/147

query = MatchQuery("puppy", "text", fuzziness=2)
result = client.execute(f"SELECT * FROM fts('table',
'{query.to_json()}')")

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-10-31 10:06:19 -07:00
Lance Release
0b7b27481e Bump version: 0.22.3-beta.3 → 0.22.3-beta.4 2025-10-31 01:14:39 +00:00
29 changed files with 2296 additions and 34 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.22.3-beta.3"
current_version = "0.22.3-beta.4"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

6
Cargo.lock generated
View File

@@ -4684,7 +4684,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.22.3-beta.3"
version = "0.22.3-beta.4"
dependencies = [
"ahash",
"anyhow",
@@ -4781,7 +4781,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.22.3-beta.3"
version = "0.22.3-beta.4"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4801,7 +4801,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.25.3-beta.3"
version = "0.25.3-beta.4"
dependencies = [
"arrow",
"async-trait",

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.3-beta.3</version>
<version>0.22.3-beta.4</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.3-beta.3</version>
<version>0.22.3-beta.4</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.3-beta.3</version>
<version>0.22.3-beta.4</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.22.3-beta.3"
version = "0.22.3-beta.4"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.22.3-beta.3",
"version": "0.22.3-beta.4",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.25.3-beta.4"
current_version = "0.25.3-beta.5"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.25.3-beta.4"
version = "0.25.3-beta.5"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -339,3 +339,4 @@ class AsyncPermutationBuilder:
def async_permutation_builder(
table: Table, dest_table_name: str
) -> AsyncPermutationBuilder: ...
def fts_query_to_json(query: Any) -> str: ...

View File

@@ -37,7 +37,7 @@ from .rerankers.base import Reranker
from .rerankers.rrf import RRFReranker
from .rerankers.util import check_reranker_result
from .util import flatten_columns
from lancedb._lancedb import fts_query_to_json
from typing_extensions import Annotated
if TYPE_CHECKING:
@@ -124,6 +124,24 @@ class FullTextQuery(ABC):
"""
pass
def to_json(self) -> str:
"""
Convert the query to a JSON string.
Returns
-------
str
A JSON string representation of the query.
Examples
--------
>>> from lancedb.query import MatchQuery
>>> query = MatchQuery("puppy", "text", fuzziness=2)
>>> query.to_json()
'{"match":{"column":"text","terms":"puppy","boost":1.0,"fuzziness":2,"max_expansions":50,"operator":"Or","prefix_length":0}}'
"""
return fts_query_to_json(self)
def __and__(self, other: "FullTextQuery") -> "FullTextQuery":
"""
Combine two queries with a logical AND operation.
@@ -288,6 +306,8 @@ class BooleanQuery(FullTextQuery):
----------
queries : list[tuple(Occur, FullTextQuery)]
The list of queries with their occurrence requirements.
Each tuple contains an Occur value (MUST, SHOULD, or MUST_NOT)
and a FullTextQuery to apply.
"""
queries: list[tuple[Occur, FullTextQuery]]

View File

@@ -20,7 +20,14 @@ from unittest import mock
import lancedb as ldb
from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery
from lancedb.query import (
BoostQuery,
MatchQuery,
MultiMatchQuery,
PhraseQuery,
BooleanQuery,
Occur,
)
import numpy as np
import pyarrow as pa
import pandas as pd
@@ -727,3 +734,146 @@ def test_fts_ngram(mem_db: DBConnection):
results = table.search("la", query_type="fts").limit(10).to_list()
assert len(results) == 2
assert set(r["text"] for r in results) == {"lance database", "lance is cool"}
def test_fts_query_to_json():
"""Test that FTS query to_json() produces valid JSON strings with exact format."""
# Test MatchQuery - basic
match_query = MatchQuery("hello world", "text")
json_str = match_query.to_json()
expected = (
'{"match":{"column":"text","terms":"hello world","boost":1.0,'
'"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}'
)
assert json_str == expected
# Test MatchQuery with options
match_query = MatchQuery("puppy", "text", fuzziness=2, boost=1.5, prefix_length=3)
json_str = match_query.to_json()
expected = (
'{"match":{"column":"text","terms":"puppy","boost":1.5,"fuzziness":2,'
'"max_expansions":50,"operator":"Or","prefix_length":3}}'
)
assert json_str == expected
# Test PhraseQuery
phrase_query = PhraseQuery("quick brown fox", "title")
json_str = phrase_query.to_json()
expected = '{"phrase":{"column":"title","terms":"quick brown fox","slop":0}}'
assert json_str == expected
# Test PhraseQuery with slop
phrase_query = PhraseQuery("quick brown", "title", slop=2)
json_str = phrase_query.to_json()
expected = '{"phrase":{"column":"title","terms":"quick brown","slop":2}}'
assert json_str == expected
# Test BooleanQuery with MUST
must_query = BooleanQuery(
[
(Occur.MUST, MatchQuery("puppy", "text")),
(Occur.MUST, MatchQuery("runs", "text")),
]
)
json_str = must_query.to_json()
expected = (
'{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
'"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
'"prefix_length":0}},{"match":{"column":"text","terms":"runs","boost":1.0,'
'"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}}],'
'"must_not":[]}}'
)
assert json_str == expected
# Test BooleanQuery with SHOULD
should_query = BooleanQuery(
[
(Occur.SHOULD, MatchQuery("cat", "text")),
(Occur.SHOULD, MatchQuery("dog", "text")),
]
)
json_str = should_query.to_json()
expected = (
'{"boolean":{"should":[{"match":{"column":"text","terms":"cat","boost":1.0,'
'"fuzziness":0,"max_expansions":50,"operator":"Or","prefix_length":0}},'
'{"match":{"column":"text","terms":"dog","boost":1.0,"fuzziness":0,'
'"max_expansions":50,"operator":"Or","prefix_length":0}}],"must":[],'
'"must_not":[]}}'
)
assert json_str == expected
# Test BooleanQuery with MUST_NOT
must_not_query = BooleanQuery(
[
(Occur.MUST, MatchQuery("puppy", "text")),
(Occur.MUST_NOT, MatchQuery("training", "text")),
]
)
json_str = must_not_query.to_json()
expected = (
'{"boolean":{"should":[],"must":[{"match":{"column":"text","terms":"puppy",'
'"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
'"prefix_length":0}}],"must_not":[{"match":{"column":"text",'
'"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
'"operator":"Or","prefix_length":0}}]}}'
)
assert json_str == expected
# Test BoostQuery
positive = MatchQuery("puppy", "text")
negative = MatchQuery("training", "text")
boost_query = BoostQuery(positive, negative, negative_boost=0.3)
json_str = boost_query.to_json()
expected = (
'{"boost":{"positive":{"match":{"column":"text","terms":"puppy",'
'"boost":1.0,"fuzziness":0,"max_expansions":50,"operator":"Or",'
'"prefix_length":0}},"negative":{"match":{"column":"text",'
'"terms":"training","boost":1.0,"fuzziness":0,"max_expansions":50,'
'"operator":"Or","prefix_length":0}},"negative_boost":0.3}}'
)
assert json_str == expected
# Test MultiMatchQuery
multi_match = MultiMatchQuery("python", ["tags", "title"])
json_str = multi_match.to_json()
expected = (
'{"multi_match":{"query":"python","columns":["tags","title"],'
'"boost":[1.0,1.0]}}'
)
assert json_str == expected
# Test complex nested BooleanQuery
inner1 = BooleanQuery(
[
(Occur.MUST, MatchQuery("python", "tags")),
(Occur.MUST, MatchQuery("tutorial", "title")),
]
)
inner2 = BooleanQuery(
[
(Occur.MUST, MatchQuery("rust", "tags")),
(Occur.MUST, MatchQuery("guide", "title")),
]
)
complex_query = BooleanQuery(
[
(Occur.SHOULD, inner1),
(Occur.SHOULD, inner2),
]
)
json_str = complex_query.to_json()
expected = (
'{"boolean":{"should":[{"boolean":{"should":[],"must":[{"match":'
'{"column":"tags","terms":"python","boost":1.0,"fuzziness":0,'
'"max_expansions":50,"operator":"Or","prefix_length":0}},{"match":'
'{"column":"title","terms":"tutorial","boost":1.0,"fuzziness":0,'
'"max_expansions":50,"operator":"Or","prefix_length":0}}],"must_not":[]}}'
',{"boolean":{"should":[],"must":[{"match":{"column":"tags",'
'"terms":"rust","boost":1.0,"fuzziness":0,"max_expansions":50,'
'"operator":"Or","prefix_length":0}},{"match":{"column":"title",'
'"terms":"guide","boost":1.0,"fuzziness":0,"max_expansions":50,'
'"operator":"Or","prefix_length":0}}],"must_not":[]}}],"must":[],'
'"must_not":[]}}'
)
assert json_str == expected

View File

@@ -55,6 +55,7 @@ pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(connect, m)?)?;
m.add_function(wrap_pyfunction!(permutation::async_permutation_builder, m)?)?;
m.add_function(wrap_pyfunction!(util::validate_table_name, m)?)?;
m.add_function(wrap_pyfunction!(query::fts_query_to_json, m)?)?;
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
Ok(())
}

View File

@@ -23,6 +23,7 @@ use lancedb::query::{
};
use lancedb::table::AnyQuery;
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
use pyo3::pyfunction;
use pyo3::pymethods;
use pyo3::types::PyList;
use pyo3::types::{PyDict, PyString};
@@ -982,3 +983,15 @@ impl HybridQuery {
req
}
}
/// Convert a Python FTS query to JSON string
#[pyfunction]
pub fn fts_query_to_json(query_obj: &Bound<'_, PyAny>) -> PyResult<String> {
let wrapped: PyLanceDB<FtsQuery> = query_obj.extract()?;
lancedb::table::datafusion::udtf::fts::to_json(&wrapped.0).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
"Failed to serialize FTS query to JSON: {}",
e
))
})
}

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.22.3-beta.3"
version = "0.22.3-beta.4"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -667,6 +667,12 @@ pub struct QueryRequest {
/// Configure how query results are normalized when doing hybrid search
pub norm: Option<NormalizeMethod>,
/// If set to true, disables automatic projection of scoring columns (_score, _distance).
/// When disabled, these columns are only included if explicitly requested in the projection.
///
/// By default, this is false (scoring columns are auto-projected for backward compatibility).
pub disable_scoring_autoprojection: bool,
}
impl Default for QueryRequest {
@@ -682,6 +688,7 @@ impl Default for QueryRequest {
prefilter: true,
reranker: None,
norm: None,
disable_scoring_autoprojection: false,
}
}
}

View File

@@ -2391,6 +2391,10 @@ impl BaseTable for NativeTable {
scanner.distance_metric(distance_type.into());
}
if query.base.disable_scoring_autoprojection {
scanner.disable_scoring_autoprojection();
}
Ok(scanner.create_plan().await?)
}

View File

@@ -2,6 +2,9 @@
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
pub mod udtf;
use std::{collections::HashMap, sync::Arc};
use arrow_array::RecordBatch;
@@ -21,6 +24,8 @@ use crate::{
query::{QueryExecutionOptions, QueryFilter, QueryRequest, Select},
Result,
};
use arrow_schema::{DataType, Field};
use lance_index::scalar::FullTextSearchQuery;
/// Datafusion attempts to maintain batch metadata
///
@@ -135,19 +140,38 @@ impl ExecutionPlan for MetadataEraserExec {
pub struct BaseTableAdapter {
table: Arc<dyn BaseTable>,
schema: Arc<ArrowSchema>,
fts_query: Option<FullTextSearchQuery>,
}
impl BaseTableAdapter {
pub async fn try_new(table: Arc<dyn BaseTable>) -> Result<Self> {
let schema = Arc::new(
table
.schema()
.await?
.as_ref()
.clone()
.with_metadata(HashMap::default()),
);
Ok(Self { table, schema })
let schema = table
.schema()
.await?
.as_ref()
.clone()
.with_metadata(HashMap::default());
Ok(Self {
table,
schema: Arc::new(schema),
fts_query: None,
})
}
/// Create a new adapter with an FTS query applied.
pub fn with_fts_query(&self, fts_query: FullTextSearchQuery) -> Self {
// Add _score column to the schema
let score_field = Field::new("_score", DataType::Float32, true);
let mut fields = self.schema.fields().to_vec();
fields.push(Arc::new(score_field));
let schema = Arc::new(ArrowSchema::new(fields));
Self {
table: self.table.clone(),
schema,
fts_query: Some(fts_query),
}
}
}
@@ -172,7 +196,15 @@ impl TableProvider for BaseTableAdapter {
filters: &[Expr],
limit: Option<usize>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
let mut query = QueryRequest::default();
// For FTS queries, disable auto-projection of _score to match DataFusion expectations
let disable_scoring = self.fts_query.is_some() && projection.is_some();
let mut query = QueryRequest {
full_text_search: self.fts_query.clone(),
disable_scoring_autoprojection: disable_scoring,
..Default::default()
};
if let Some(projection) = projection {
let field_names = projection
.iter()

View File

@@ -0,0 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! User-Defined Table Functions (UDTFs) for DataFusion integration
pub mod fts;

File diff suppressed because it is too large Load Diff