From fec8d58f06aa5d69777de9b10eb44b247ba0c607 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Thu, 12 Jun 2025 17:04:19 +0800 Subject: [PATCH] feat: support a bunch of FTS features in JS SDK (#2431) - operator for match query - slop for phrase query - boolean query ## Summary by CodeRabbit - **New Features** - Introduced support for boolean full-text search queries with AND/OR logic and occurrence conditions. - Added operator options for match and multi-match queries to control term combination logic. - Enabled phrase queries to specify proximity (slop) for flexible phrase matching. - Added new enumerations (`Operator`, `Occur`) and the `BooleanQuery` class for enhanced query expressiveness. - **Bug Fixes** - Improved validation and error handling for invalid operator and occurrence inputs in full-text queries. - **Tests** - Expanded test coverage with new cases for boolean queries and operator-based full-text searches. --------- Signed-off-by: BubbleCal --- docs/src/guides/sql_querying.md | 2 + docs/src/js/classes/BooleanQuery.md | 53 ++++++++++++++++++ docs/src/js/classes/MatchQuery.md | 3 + docs/src/js/classes/MultiMatchQuery.md | 3 + docs/src/js/classes/PhraseQuery.md | 11 +++- docs/src/js/enumerations/FullTextQueryType.md | 8 +++ docs/src/js/enumerations/Occur.md | 28 ++++++++++ docs/src/js/enumerations/Operator.md | 28 ++++++++++ docs/src/js/globals.md | 3 + docs/test/requirements.txt | 1 + nodejs/__test__/table.test.ts | 51 ++++++++++++++++- nodejs/lancedb/index.ts | 3 + nodejs/lancedb/query.ts | 56 ++++++++++++++++++- nodejs/src/query.rs | 49 ++++++++++++++-- 14 files changed, 291 insertions(+), 8 deletions(-) create mode 100644 docs/src/js/classes/BooleanQuery.md create mode 100644 docs/src/js/enumerations/Occur.md create mode 100644 docs/src/js/enumerations/Operator.md diff --git a/docs/src/guides/sql_querying.md b/docs/src/guides/sql_querying.md index 2da2a3c4..27cfa79a 100644 --- a/docs/src/guides/sql_querying.md +++ b/docs/src/guides/sql_querying.md @@ 
-42,6 +42,7 @@ duckdb.query("SELECT * FROM arrow_table") Have the required imports before doing any querying. === "Python" + ```python --8<-- "python/python/tests/docs/test_guide_tables.py:import-lancedb" --8<-- "python/python/tests/docs/test_guide_tables.py:import-session-context" @@ -51,6 +52,7 @@ Have the required imports before doing any querying. Register the table created with the Datafusion session context. === "Python" + ```python --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic" ``` diff --git a/docs/src/js/classes/BooleanQuery.md b/docs/src/js/classes/BooleanQuery.md new file mode 100644 index 00000000..7925f473 --- /dev/null +++ b/docs/src/js/classes/BooleanQuery.md @@ -0,0 +1,53 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / BooleanQuery + +# Class: BooleanQuery + +Represents a full-text query interface. +This interface defines the structure and behavior for full-text queries, +including methods to retrieve the query type and convert the query to a dictionary format. + +## Implements + +- [`FullTextQuery`](../interfaces/FullTextQuery.md) + +## Constructors + +### new BooleanQuery() + +```ts +new BooleanQuery(queries): BooleanQuery +``` + +Creates an instance of BooleanQuery. + +#### Parameters + +* **queries**: [[`Occur`](../enumerations/Occur.md), [`FullTextQuery`](../interfaces/FullTextQuery.md)][] + An array of (Occur, FullTextQuery objects) to combine. + Occur specifies whether the query must match, or should match. + +#### Returns + +[`BooleanQuery`](BooleanQuery.md) + +## Methods + +### queryType() + +```ts +queryType(): FullTextQueryType +``` + +The type of the full-text query. 
+ +#### Returns + +[`FullTextQueryType`](../enumerations/FullTextQueryType.md) + +#### Implementation of + +[`FullTextQuery`](../interfaces/FullTextQuery.md).[`queryType`](../interfaces/FullTextQuery.md#querytype) diff --git a/docs/src/js/classes/MatchQuery.md b/docs/src/js/classes/MatchQuery.md index aa936930..e69e47ca 100644 --- a/docs/src/js/classes/MatchQuery.md +++ b/docs/src/js/classes/MatchQuery.md @@ -40,6 +40,7 @@ Creates an instance of MatchQuery. - `boost`: The boost factor for the query (default is 1.0). - `fuzziness`: The fuzziness level for the query (default is 0). - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50). + - `operator`: The logical operator to use for combining terms in the query (default is "OR"). * **options.boost?**: `number` @@ -47,6 +48,8 @@ Creates an instance of MatchQuery. * **options.maxExpansions?**: `number` +* **options.operator?**: [`Operator`](../enumerations/Operator.md) + #### Returns [`MatchQuery`](MatchQuery.md) diff --git a/docs/src/js/classes/MultiMatchQuery.md b/docs/src/js/classes/MultiMatchQuery.md index dca5d128..f6c62036 100644 --- a/docs/src/js/classes/MultiMatchQuery.md +++ b/docs/src/js/classes/MultiMatchQuery.md @@ -38,9 +38,12 @@ Creates an instance of MultiMatchQuery. * **options?** Optional parameters for the multi-match query. - `boosts`: An array of boost factors for each column (default is 1.0 for all). + - `operator`: The logical operator to use for combining terms in the query (default is "OR"). 
* **options.boosts?**: `number`[] +* **options.operator?**: [`Operator`](../enumerations/Operator.md) + #### Returns [`MultiMatchQuery`](MultiMatchQuery.md) diff --git a/docs/src/js/classes/PhraseQuery.md b/docs/src/js/classes/PhraseQuery.md index 10315de0..48214a5d 100644 --- a/docs/src/js/classes/PhraseQuery.md +++ b/docs/src/js/classes/PhraseQuery.md @@ -19,7 +19,10 @@ including methods to retrieve the query type and convert the query to a dictiona ### new PhraseQuery() ```ts -new PhraseQuery(query, column): PhraseQuery +new PhraseQuery( + query, + column, + options?): PhraseQuery ``` Creates an instance of `PhraseQuery`. @@ -32,6 +35,12 @@ Creates an instance of `PhraseQuery`. * **column**: `string` The name of the column to search within. +* **options?** + Optional parameters for the phrase query. + - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0). + +* **options.slop?**: `number` + #### Returns [`PhraseQuery`](PhraseQuery.md) diff --git a/docs/src/js/enumerations/FullTextQueryType.md b/docs/src/js/enumerations/FullTextQueryType.md index baec0d51..7a79b296 100644 --- a/docs/src/js/enumerations/FullTextQueryType.md +++ b/docs/src/js/enumerations/FullTextQueryType.md @@ -15,6 +15,14 @@ Enum representing the types of full-text queries supported. ## Enumeration Members +### Boolean + +```ts +Boolean: "boolean"; +``` + +*** + ### Boost ```ts diff --git a/docs/src/js/enumerations/Occur.md b/docs/src/js/enumerations/Occur.md new file mode 100644 index 00000000..5e84958b --- /dev/null +++ b/docs/src/js/enumerations/Occur.md @@ -0,0 +1,28 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / Occur + +# Enumeration: Occur + +Enum representing the occurrence of terms in full-text queries. + +- `Must`: The term must be present in the document. +- `Should`: The term should contribute to the document score, but is not required. 
+ +## Enumeration Members + +### Must + +```ts +Must: "MUST"; +``` + +*** + +### Should + +```ts +Should: "SHOULD"; +``` diff --git a/docs/src/js/enumerations/Operator.md b/docs/src/js/enumerations/Operator.md new file mode 100644 index 00000000..86fd382d --- /dev/null +++ b/docs/src/js/enumerations/Operator.md @@ -0,0 +1,28 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / Operator + +# Enumeration: Operator + +Enum representing the logical operators used in full-text queries. + +- `And`: All terms must match. +- `Or`: At least one term must match. + +## Enumeration Members + +### And + +```ts +And: "AND"; +``` + +*** + +### Or + +```ts +Or: "OR"; +``` diff --git a/docs/src/js/globals.md b/docs/src/js/globals.md index e555e2ab..caf73804 100644 --- a/docs/src/js/globals.md +++ b/docs/src/js/globals.md @@ -12,9 +12,12 @@ ## Enumerations - [FullTextQueryType](enumerations/FullTextQueryType.md) +- [Occur](enumerations/Occur.md) +- [Operator](enumerations/Operator.md) ## Classes +- [BooleanQuery](classes/BooleanQuery.md) - [BoostQuery](classes/BoostQuery.md) - [Connection](classes/Connection.md) - [Index](classes/Index.md) diff --git a/docs/test/requirements.txt b/docs/test/requirements.txt index bbccf6b8..3e7a8611 100644 --- a/docs/test/requirements.txt +++ b/docs/test/requirements.txt @@ -7,3 +7,4 @@ tantivy==0.20.1 --extra-index-url https://download.pytorch.org/whl/cpu torch polars>=0.19, <=1.3.0 +datafusion diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 1bad17fe..5b4f9d86 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -33,7 +33,12 @@ import { register, } from "../lancedb/embedding"; import { Index } from "../lancedb/indices"; -import { instanceOfFullTextQuery } from "../lancedb/query"; +import { + BooleanQuery, + Occur, + Operator, + instanceOfFullTextQuery, +} from "../lancedb/query"; import exp = require("constants"); describe.each([arrow15, 
arrow16, arrow17, arrow18])( @@ -1531,6 +1536,18 @@ describe.each([arrow15, arrow16, arrow17, arrow18])( const results = await table.search("hello").toArray(); expect(results[0].text).toBe(data[0].text); + + const results2 = await table + .search(new MatchQuery("hello world", "text")) + .toArray(); + expect(results2.length).toBe(2); + + const results3 = await table + .search( + new MatchQuery("hello world", "text", { operator: Operator.And }), + ) + .toArray(); + expect(results3.length).toBe(1); }); test("full text search without lowercase", async () => { @@ -1609,6 +1626,38 @@ describe.each([arrow15, arrow16, arrow17, arrow18])( expect(resultSet.has("food")).toBe(true); }); + test("full text search boolean query", async () => { + const db = await connect(tmpDir.name); + const data = [ + { text: "hello world", vector: [0.1, 0.2, 0.3] }, + { text: "goodbye world", vector: [0.4, 0.5, 0.6] }, + ]; + const table = await db.createTable("test", data); + await table.createIndex("text", { + config: Index.fts({ withPosition: false }), + }); + + const shouldResults = await table + .search( + new BooleanQuery([ + [Occur.Should, new MatchQuery("hello", "text")], + [Occur.Should, new MatchQuery("goodbye", "text")], + ]), + ) + .toArray(); + expect(shouldResults.length).toBe(2); + + const mustResults = await table + .search( + new BooleanQuery([ + [Occur.Must, new MatchQuery("hello", "text")], + [Occur.Must, new MatchQuery("world", "text")], + ]), + ) + .toArray(); + expect(mustResults.length).toBe(1); + }); + test.each([ [0.4, 0.5, 0.599], // number[] Float32Array.of(0.4, 0.5, 0.599), // Float32Array diff --git a/nodejs/lancedb/index.ts b/nodejs/lancedb/index.ts index 9565191f..0750a48c 100644 --- a/nodejs/lancedb/index.ts +++ b/nodejs/lancedb/index.ts @@ -64,7 +64,10 @@ export { PhraseQuery, BoostQuery, MultiMatchQuery, + BooleanQuery, FullTextQueryType, + Operator, + Occur, } from "./query"; export { diff --git a/nodejs/lancedb/query.ts b/nodejs/lancedb/query.ts index 
80c221f1..b544faa1 100644 --- a/nodejs/lancedb/query.ts +++ b/nodejs/lancedb/query.ts @@ -762,6 +762,29 @@ export enum FullTextQueryType { MatchPhrase = "match_phrase", Boost = "boost", MultiMatch = "multi_match", + Boolean = "boolean", +} + +/** + * Enum representing the logical operators used in full-text queries. + * + * - `And`: All terms must match. + * - `Or`: At least one term must match. + */ +export enum Operator { + And = "AND", + Or = "OR", +} + +/** + * Enum representing the occurrence of terms in full-text queries. + * + * - `Must`: The term must be present in the document. + * - `Should`: The term should contribute to the document score, but is not required. + */ +export enum Occur { + Must = "MUST", + Should = "SHOULD", } /** @@ -791,6 +814,7 @@ export function instanceOfFullTextQuery(obj: any): obj is FullTextQuery { export class MatchQuery implements FullTextQuery { /** @ignore */ public readonly inner: JsFullTextQuery; + /** * Creates an instance of MatchQuery. * @@ -800,6 +824,7 @@ export class MatchQuery implements FullTextQuery { * - `boost`: The boost factor for the query (default is 1.0). * - `fuzziness`: The fuzziness level for the query (default is 0). * - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50). + * - `operator`: The logical operator to use for combining terms in the query (default is "OR"). */ constructor( query: string, @@ -808,6 +833,7 @@ export class MatchQuery implements FullTextQuery { boost?: number; fuzziness?: number; maxExpansions?: number; + operator?: Operator; }, ) { let fuzziness = options?.fuzziness; @@ -820,6 +846,7 @@ export class MatchQuery implements FullTextQuery { options?.boost ?? 1.0, fuzziness, options?.maxExpansions ?? 50, + options?.operator ?? Operator.Or, ); } @@ -836,9 +863,11 @@ export class PhraseQuery implements FullTextQuery { * * @param query - The phrase to search for in the specified column. * @param column - The name of the column to search within. 
+ * @param options - Optional parameters for the phrase query. + * - `slop`: The maximum number of intervening unmatched positions allowed between words in the phrase (default is 0). */ - constructor(query: string, column: string) { - this.inner = JsFullTextQuery.phraseQuery(query, column); + constructor(query: string, column: string, options?: { slop?: number }) { + this.inner = JsFullTextQuery.phraseQuery(query, column, options?.slop ?? 0); } queryType(): FullTextQueryType { @@ -889,18 +918,21 @@ export class MultiMatchQuery implements FullTextQuery { * @param columns - An array of column names to search within. * @param options - Optional parameters for the multi-match query. * - `boosts`: An array of boost factors for each column (default is 1.0 for all). + * - `operator`: The logical operator to use for combining terms in the query (default is "OR"). */ constructor( query: string, columns: string[], options?: { boosts?: number[]; + operator?: Operator; }, ) { this.inner = JsFullTextQuery.multiMatchQuery( query, columns, options?.boosts, + options?.operator ?? Operator.Or, ); } @@ -908,3 +940,23 @@ export class MultiMatchQuery implements FullTextQuery { return FullTextQueryType.MultiMatch; } } + +export class BooleanQuery implements FullTextQuery { + /** @ignore */ + public readonly inner: JsFullTextQuery; + /** + * Creates an instance of BooleanQuery. + * + * @param queries - An array of (Occur, FullTextQuery objects) to combine. + * Occur specifies whether the query must match, or should match. 
+ */ + constructor(queries: [Occur, FullTextQuery][]) { + this.inner = JsFullTextQuery.booleanQuery( + queries.map(([occur, query]) => [occur, query.inner]), + ); + } + + queryType(): FullTextQueryType { + return FullTextQueryType.Boolean; + } +} diff --git a/nodejs/src/query.rs b/nodejs/src/query.rs index 442ae79a..3e3208cf 100644 --- a/nodejs/src/query.rs +++ b/nodejs/src/query.rs @@ -4,7 +4,8 @@ use std::sync::Arc; use lancedb::index::scalar::{ - BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, PhraseQuery, + BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur, + Operator, PhraseQuery, }; use lancedb::query::ExecutableQuery; use lancedb::query::Query as LanceDbQuery; @@ -308,6 +309,7 @@ impl JsFullTextQuery { boost: f64, fuzziness: Option<u32>, max_expansions: u32, + operator: String, ) -> napi::Result<Self> { Ok(Self { inner: MatchQuery::new(query) @@ -315,14 +317,22 @@ impl JsFullTextQuery { .with_boost(boost as f32) .with_fuzziness(fuzziness) .with_max_expansions(max_expansions as usize) + .with_operator( + Operator::try_from(operator.as_str()).map_err(|e| { + napi::Error::from_reason(format!("Invalid operator: {}", e)) + })?, + ) .into(), }) } #[napi(factory)] - pub fn phrase_query(query: String, column: String) -> napi::Result<Self> { + pub fn phrase_query(query: String, column: String, slop: u32) -> napi::Result<Self> { Ok(Self { - inner: PhraseQuery::new(query).with_column(Some(column)).into(), + inner: PhraseQuery::new(query) + .with_column(Some(column)) + .with_slop(slop) + .into(), }) } @@ -348,6 +358,7 @@ impl JsFullTextQuery { query: String, columns: Vec<String>, boosts: Option<Vec<f64>>, + operator: String, ) -> napi::Result<Self> { let q = match boosts { Some(boosts) => MultiMatchQuery::try_new(query, columns) @@ -358,7 +369,37 @@ impl JsFullTextQuery { napi::Error::from_reason(format!("Failed to create multi match query: {}", e)) })?; - Ok(Self { inner: q.into() }) + let operator = Operator::try_from(operator.as_str()).map_err(|e| 
{ + napi::Error::from_reason(format!("Invalid operator for multi match query: {}", e)) + })?; + + Ok(Self { + inner: q.with_operator(operator).into(), + }) + } + + #[napi(factory)] + pub fn boolean_query(queries: Vec<(String, &JsFullTextQuery)>) -> napi::Result<Self> { + let mut sub_queries = Vec::with_capacity(queries.len()); + for (occur, q) in queries { + let occur = Occur::try_from(occur.as_str()) + .map_err(|e| napi::Error::from_reason(e.to_string()))?; + sub_queries.push((occur, q.inner.clone())); + } + Ok(Self { + inner: BooleanQuery::new(sub_queries).into(), + }) + } + + #[napi(getter)] + pub fn query_type(&self) -> String { + match self.inner { + FtsQuery::Match(_) => "match".to_string(), + FtsQuery::Phrase(_) => "phrase".to_string(), + FtsQuery::Boost(_) => "boost".to_string(), + FtsQuery::MultiMatch(_) => "multi_match".to_string(), + FtsQuery::Boolean(_) => "boolean".to_string(), + } + } }