diff --git a/docs/src/fts.md b/docs/src/fts.md index 50e1cebb..37983f3d 100644 --- a/docs/src/fts.md +++ b/docs/src/fts.md @@ -62,7 +62,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex }); await tbl - .search("puppy") + .search("puppy", queryType="fts") .select(["text"]) .limit(10) .toArray(); @@ -205,7 +205,7 @@ table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=[" ## Phrase queries vs. terms queries !!! warning "Warn" - Phrase queries are available for only Tantivy-based FTS + Lance-based FTS doesn't support queries that combine terms with the boolean operators `OR` and `AND`. For full-text search you can specify either a **phrase** query like `"the old man and the sea"`, or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index e1f254c0..3c979031 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -844,6 +844,38 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])( expect(results[0].text).toBe(data[0].text); }); + test("full text search without positions", async () => { + const db = await connect(tmpDir.name); + const data = [ + { text: "hello world", vector: [0.1, 0.2, 0.3] }, + { text: "goodbye world", vector: [0.4, 0.5, 0.6] }, + ]; + const table = await db.createTable("test", data); + await table.createIndex("text", { + config: Index.fts({ withPositions: false }), + }); + + const results = await table.search("hello").toArray(); + expect(results[0].text).toBe(data[0].text); + }); + + test("full text search phrase query", async () => { + const db = await connect(tmpDir.name); + const data = [ + { text: "hello world", vector: [0.1, 0.2, 0.3] }, + { text: "goodbye world", vector: [0.4, 0.5, 0.6] }, + ]; + const table = await db.createTable("test", data); + await table.createIndex("text", { + config: Index.fts(), + }); + + const results = await 
table.search("world").toArray(); + expect(results.length).toBe(2); + const phraseResults = await table.search('"hello world"').toArray(); + expect(phraseResults.length).toBe(1); + }); + test.each([ [0.4, 0.5, 0.599], // number[] Float32Array.of(0.4, 0.5, 0.599), // Float32Array diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts index 8b1ab3e7..8eb20b71 100644 --- a/nodejs/lancedb/indices.ts +++ b/nodejs/lancedb/indices.ts @@ -113,6 +113,19 @@ export interface IvfPqOptions { sampleRate?: number; } +/** + * Options to create a full text search index + */ +export interface FtsOptions { + /** + * Whether to build the index with positions. + * True by default. + * If set to false, the index will not store the positions of the tokens in the text, + * which will make the index smaller and faster to build, but will not support phrase queries. + */ + withPositions?: boolean; +} + export class Index { private readonly inner: LanceDbIndex; private constructor(inner: LanceDbIndex) { @@ -211,8 +224,8 @@ export class Index { * * For now, the full text search index only supports English, and doesn't support phrase search. 
*/ - static fts() { - return new Index(LanceDbIndex.fts()); + static fts(options?: Partial) { + return new Index(LanceDbIndex.fts(options?.withPositions)); } } diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index 21b212e2..56c68ae8 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -92,9 +92,13 @@ impl Index { } #[napi(factory)] - pub fn fts() -> Self { + pub fn fts(with_position: Option) -> Self { + let mut opts = FtsIndexBuilder::default(); + if let Some(with_position) = with_position { + opts = opts.with_position(with_position); + } Self { - inner: Mutex::new(Some(LanceDbIndex::FTS(FtsIndexBuilder::default()))), + inner: Mutex::new(Some(LanceDbIndex::FTS(opts))), } } } diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index 2e0c7b95..aab8948d 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -78,8 +78,8 @@ class FTS: For example, it works with `title`, `description`, `content`, etc. """ - def __init__(self): - self._inner = LanceDbIndex.fts() + def __init__(self, with_position: bool = True): + self._inner = LanceDbIndex.fts(with_position=with_position) class IvfPq: diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index 4f5f6a0c..c4d0ff0b 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -126,6 +126,7 @@ class RemoteTable(Table): column: str, *, replace: bool = False, + with_position: bool = True, ): data = { "column": column, diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 89cf1247..ec9f652e 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -468,6 +468,7 @@ class Table(ABC): ordering_field_names: Union[str, List[str]] = None, *, replace: bool = False, + with_position: bool = True, writer_heap_size: Optional[int] = 1024 * 1024 * 1024, tokenizer_name: str = "default", use_tantivy: bool = True, @@ -500,6 +501,12 @@ class 
Table(ABC): use_tantivy: bool, default True If True, use the legacy full-text search implementation based on tantivy. If False, use the new full-text search implementation based on lance-index. + with_position: bool, default True + Only available with use_tantivy=False + If False, do not store the positions of the terms in the text. + This can reduce the size of the index and improve indexing speed. + But it will not be possible to use phrase queries. + """ raise NotImplementedError @@ -1305,6 +1312,7 @@ class LanceTable(Table): ordering_field_names: Union[str, List[str]] = None, *, replace: bool = False, + with_position: bool = True, writer_heap_size: Optional[int] = 1024 * 1024 * 1024, tokenizer_name: str = "default", use_tantivy: bool = True, @@ -1318,7 +1326,10 @@ class LanceTable(Table): if exist: fs.delete_dir(path) self._dataset_mut.create_scalar_index( - field_names, index_type="INVERTED", replace=replace + field_names, + index_type="INVERTED", + replace=replace, + with_position=with_position, ) return diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 54ba9cf4..ade569ac 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table): @pytest.mark.parametrize("use_tantivy", [True, False]) -def test_create_inverted_index(table, use_tantivy): - table.create_fts_index("text", use_tantivy=use_tantivy) +@pytest.mark.parametrize("with_position", [True, False]) +def test_create_inverted_index(table, use_tantivy, with_position): + if use_tantivy and not with_position: + pytest.skip("we don't support to build tantivy index without position") + table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position) def test_populate_index(tmp_path, table): @@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy): assert len(results) == 5 +def test_search_fts_phrase_query(table): + table.create_fts_index("text", 
use_tantivy=False, with_position=False) + try: + phrase_results = table.search('"puppy runs"').limit(100).to_list() + assert False + except Exception: + pass + table.create_fts_index("text", use_tantivy=False, replace=True) + results = table.search("puppy").limit(100).to_list() + phrase_results = table.search('"puppy runs"').limit(100).to_list() + assert len(results) > len(phrase_results) + assert len(phrase_results) > 0 + + +@pytest.mark.asyncio +async def test_search_fts_phrase_query_async(async_table): + async_table = await async_table + await async_table.create_index("text", config=FTS(with_position=False)) + try: + phrase_results = ( + await async_table.query().nearest_to_text("puppy runs").limit(100).to_list() + ) + assert False + except Exception: + pass + await async_table.create_index("text", config=FTS()) + results = await async_table.query().nearest_to_text("puppy").limit(100).to_list() + phrase_results = ( + await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list() + ) + assert len(results) > len(phrase_results) + assert len(phrase_results) > 0 + + def test_search_fts_specify_column(table): table.create_fts_index("text", use_tantivy=False) table.create_fts_index("text2", use_tantivy=False) diff --git a/python/src/index.rs b/python/src/index.rs index 5a857561..58474b33 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -14,6 +14,7 @@ use std::sync::Mutex; +use lancedb::index::scalar::FtsIndexBuilder; use lancedb::{ index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex}, DistanceType, @@ -100,10 +101,14 @@ impl Index { } #[staticmethod] - pub fn fts() -> PyResult { - Ok(Self { - inner: Mutex::new(Some(LanceDbIndex::FTS(Default::default()))), - }) + pub fn fts(with_position: Option) -> Self { + let mut opts = FtsIndexBuilder::default(); + if let Some(with_position) = with_position { + opts = opts.with_position(with_position); + } + Self { + inner: Mutex::new(Some(LanceDbIndex::FTS(opts))), + } } 
} diff --git a/rust/lancedb/src/index/scalar.rs b/rust/lancedb/src/index/scalar.rs index cf1e695e..553ad64f 100644 --- a/rust/lancedb/src/index/scalar.rs +++ b/rust/lancedb/src/index/scalar.rs @@ -51,9 +51,25 @@ pub struct LabelListIndexBuilder {} /// Builder for a full text search index /// /// A full text search index is an index on a string column that allows for full text search -#[derive(Debug, Clone, Default)] -pub struct FtsIndexBuilder {} +#[derive(Debug, Clone)] +pub struct FtsIndexBuilder { + pub(crate) with_position: bool, +} -impl FtsIndexBuilder {} +impl Default for FtsIndexBuilder { + fn default() -> Self { + Self { + with_position: true, + } + } +} + +impl FtsIndexBuilder { + /// Set the with_position flag + pub fn with_position(mut self, with_position: bool) -> Self { + self.with_position = with_position; + self + } +} pub use lance_index::scalar::FullTextSearchQuery; diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 3e66d2c7..2244ef7f 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -52,6 +52,7 @@ use crate::arrow::IntoArrow; use crate::connection::NoData; use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry}; use crate::error::{Error, Result}; +use crate::index::scalar::FtsIndexBuilder; use crate::index::vector::{ IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex, }; @@ -1609,7 +1610,12 @@ impl NativeTable { Ok(()) } - async fn create_fts_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> { + async fn create_fts_index( + &self, + field: &Field, + fts_opts: FtsIndexBuilder, + replace: bool, + ) -> Result<()> { if !Self::supported_fts_data_type(field.data_type()) { return Err(Error::Schema { message: format!( @@ -1621,16 +1627,16 @@ impl NativeTable { } let mut dataset = self.dataset.get_mut().await?; - let lance_idx_params = lance_index::scalar::ScalarIndexParams { - force_index_type: 
Some(lance_index::scalar::ScalarIndexType::Inverted), + let fts_params = lance_index::scalar::InvertedIndexParams { + with_position: fts_opts.with_position, }; dataset .create_index( &[field.name()], - IndexType::Scalar, + IndexType::Inverted, None, - &lance_idx_params, - opts.replace, + &fts_params, + replace, ) .await?; Ok(()) @@ -1802,7 +1808,7 @@ impl TableInternal for NativeTable { Index::BTree(_) => self.create_btree_index(field, opts).await, Index::Bitmap(_) => self.create_bitmap_index(field, opts).await, Index::LabelList(_) => self.create_label_list_index(field, opts).await, - Index::FTS(_) => self.create_fts_index(field, opts).await, + Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await, Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await, Index::IvfHnswPq(ivf_hnsw_pq) => { self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)