diff --git a/Cargo.lock b/Cargo.lock index c43950d4..91c32b26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2720,7 +2720,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "rand 0.8.5", ] @@ -3708,7 +3708,7 @@ dependencies = [ [[package]] name = "lance" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-arith", @@ -3768,7 +3768,7 @@ dependencies = [ [[package]] name = "lance-arrow" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow-array", "arrow-buffer", @@ -3786,7 +3786,7 @@ dependencies = [ [[package]] name = "lance-core" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow-array", "arrow-buffer", @@ -3823,7 +3823,7 @@ dependencies = [ [[package]] name = "lance-datafusion" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-array", @@ -3851,7 +3851,7 @@ dependencies = [ [[package]] name = "lance-datagen" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-array", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "lance-encoding" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrayref", "arrow", @@ -3907,7 +3907,7 @@ dependencies = [ [[package]] name = "lance-file" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow-arith", "arrow-array", @@ -3942,7 +3942,7 @@ dependencies = [ [[package]] name = "lance-index" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-array", @@ -3996,7 +3996,7 @@ dependencies = [ [[package]] name = "lance-io" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-arith", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "lance-linalg" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow-array", "arrow-ord", @@ -4059,7 +4059,7 @@ dependencies = [ [[package]] name = "lance-table" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow", "arrow-array", @@ -4099,7 +4099,7 @@ dependencies = [ [[package]] name = "lance-testing" version = "0.25.3" -source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467" +source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247" dependencies = [ "arrow-array", "arrow-schema", @@ -4110,7 +4110,7 @@ dependencies = [ [[package]] name = "lancedb" -version = "0.19.0-beta.4" +version = "0.19.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4197,7 +4197,7 @@ dependencies = [ [[package]] name = "lancedb-node" -version = "0.19.0-beta.4" +version = "0.19.0-beta.5" dependencies = [ "arrow-array", "arrow-ipc", @@ -4222,7 +4222,7 @@ dependencies = [ [[package]] name = "lancedb-nodejs" -version = "0.19.0-beta.4" +version = "0.19.0-beta.5" dependencies = [ "arrow-array", "arrow-ipc", @@ -4240,7 +4240,7 @@ dependencies = [ [[package]] name = "lancedb-python" -version = "0.22.0-beta.4" +version = "0.22.0-beta.5" dependencies = [ "arrow", "env_logger", diff --git a/Cargo.toml b/Cargo.toml index 9be0b6d4..176c867b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,14 +23,14 @@ rust-version = "1.78.0" [workspace.dependencies] lance = { "version" = "=0.25.3", "features" = [ "dynamodb", -], tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } -lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" } +], tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } +lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" } # Note that this one does not include pyarrow arrow = { version = "54.1", optional = false } arrow-array = "54.1" diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index b306126e..1c0eb083 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -1304,6 +1304,27 @@ describe.each([arrow15, arrow16, arrow17, arrow18])( expect(results[0].text).toBe(data[0].text); }); + test("full text index on list", async () => { + const db = await connect(tmpDir.name); + const data = [ + { text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] }, + { text: ["lance database"], vector: [0.4, 0.5, 0.6] }, + { text: ["lance", "search"], vector: [0.7, 0.8, 0.9] }, + { text: ["database", "search"], vector: [1.0, 1.1, 1.2] }, + { text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] }, + ]; + const table = await db.createTable("test", data); + await table.createIndex("text", { + config: Index.fts(), + }); + + const results = await table.search("lance").toArray(); + expect(results.length).toBe(3); + + const results2 = await table.search('"lance database"').toArray(); + expect(results2.length).toBe(2); + }); + test("full text search without positions", async () => { const db = await connect(tmpDir.name); const data = [ diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 7f27be75..daf45093 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -22,6 +22,7 @@ from lancedb.db import DBConnection from lancedb.index import FTS from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery import numpy as np +import pyarrow as pa import pandas as pd import pytest from utils import exception_output @@ -626,3 +627,32 @@ def test_language(mem_db: DBConnection): # Stop words -> no results results = table.search("la", query_type="fts").limit(5).to_list() assert len(results) == 0 + + +def test_fts_on_list(mem_db: DBConnection): + data = pa.table( + { + "text": [ + ["lance database", "the", "search"], + ["lance database"], + ["lance", "search"], + ["database", "search"], + ["unrelated", "doc"], + ], + "vector": [ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + [7.0, 8.0, 9.0], + [10.0, 11.0, 12.0], + [13.0, 14.0, 15.0], + ], + } + ) + table = mem_db.create_table("test", data=data) + table.create_fts_index("text", use_tantivy=False) + + res = table.search("lance").limit(5).to_list() + assert len(res) == 3 + + res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list() + assert len(res) == 2 diff --git a/rust/lancedb/src/utils.rs b/rust/lancedb/src/utils.rs index 3f44bb0f..8d2304e3 100644 --- a/rust/lancedb/src/utils.rs +++ b/rust/lancedb/src/utils.rs @@ -158,7 +158,17 @@ pub fn supported_label_list_data_type(dtype: &DataType) -> bool { } pub fn supported_fts_data_type(dtype: &DataType) -> bool { - matches!(dtype, DataType::Utf8 | DataType::LargeUtf8) + supported_fts_data_type_impl(dtype, false) +} + +fn supported_fts_data_type_impl(dtype: &DataType, in_list: bool) -> bool { + match (dtype, in_list) { + (DataType::Utf8 | DataType::LargeUtf8, _) => true, + (DataType::List(field) | DataType::LargeList(field), false) => { + supported_fts_data_type_impl(field.data_type(), true) + } + _ => false, + } } pub fn supported_vector_data_type(dtype: &DataType) -> bool {