Compare commits

..

2 Commits

Author SHA1 Message Date
Lance Release
27d9e5c596 Bump version: 0.22.0-beta.5 → 0.22.0-beta.6 2025-04-08 06:16:14 +00:00
BubbleCal
ec8271931f feat: support to create FTS index on list of strings (#2317)
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **Chores**
- Updated internal library dependencies to the latest beta version for
improved system stability.
- **Tests**
- Added automated tests to validate full-text search functionality on
list-based text fields.
- **Refactor**
- Enhanced the search processing logic to provide robust support for
list-type text data, ensuring more reliable results.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-04-08 14:12:35 +08:00
9 changed files with 89 additions and 156 deletions

34
Cargo.lock generated
View File

@@ -2720,7 +2720,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"rand 0.8.5",
]
@@ -3708,7 +3708,7 @@ dependencies = [
[[package]]
name = "lance"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-arith",
@@ -3768,7 +3768,7 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3786,7 +3786,7 @@ dependencies = [
[[package]]
name = "lance-core"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3823,7 +3823,7 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-array",
@@ -3851,7 +3851,7 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-array",
@@ -3867,7 +3867,7 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrayref",
"arrow",
@@ -3907,7 +3907,7 @@ dependencies = [
[[package]]
name = "lance-file"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -3942,7 +3942,7 @@ dependencies = [
[[package]]
name = "lance-index"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-array",
@@ -3996,7 +3996,7 @@ dependencies = [
[[package]]
name = "lance-io"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-arith",
@@ -4035,7 +4035,7 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow-array",
"arrow-ord",
@@ -4059,7 +4059,7 @@ dependencies = [
[[package]]
name = "lance-table"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow",
"arrow-array",
@@ -4099,7 +4099,7 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "0.25.3"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.2#e0d3179bcc6e6ce5f9c5fcfeb4398789a7005467"
source = "git+https://github.com/lancedb/lance?tag=v0.25.3-beta.4#236c8f986ab9e2d478d0754fab6e8d2643c31247"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4110,7 +4110,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.19.0-beta.4"
version = "0.19.0-beta.5"
dependencies = [
"arrow",
"arrow-array",
@@ -4197,7 +4197,7 @@ dependencies = [
[[package]]
name = "lancedb-node"
version = "0.19.0-beta.4"
version = "0.19.0-beta.5"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4222,7 +4222,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.19.0-beta.4"
version = "0.19.0-beta.5"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4240,7 +4240,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.22.0-beta.4"
version = "0.22.0-beta.5"
dependencies = [
"arrow",
"env_logger",

View File

@@ -23,14 +23,14 @@ rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.25.3", "features" = [
"dynamodb",
], tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.2", git = "https://github.com/lancedb/lance" }
], tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-io = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-index = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-linalg = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-table = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-testing = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-datafusion = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
lance-encoding = { version = "=0.25.3", tag = "v0.25.3-beta.4", git = "https://github.com/lancedb/lance" }
# Note that this one does not include pyarrow
arrow = { version = "54.1", optional = false }
arrow-array = "54.1"

View File

@@ -162,7 +162,6 @@ nav:
- Choosing right query type: guides/tuning_retrievers/1_query_types.md
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- Build MCP with LanceDB: guides/mcp.md
- 🧬 Managing embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md
@@ -294,7 +293,6 @@ nav:
- Choosing right query type: guides/tuning_retrievers/1_query_types.md
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- Build MCP with LanceDB: guides/mcp.md
- Managing Embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md

View File

@@ -1,126 +0,0 @@
# MCP server with LanceDB
The Model Context Protocol (MCP) is an open protocol that enables seamless integration between LLM applications and external data sources and tools. Whether you're building an AI-powered IDE, enhancing a chat interface, or creating custom AI workflows, MCP provides a standardized way to connect LLMs with the context they need.
With LanceDB, your MCP can be embedded in your application. Let's implement 2 simple MCP tools using LanceDB
1. Add data - add data to LanceDB
2. Retrieve data - retrieve data from LanceDB
You need to install `mcp[cli]` python package.
First, let's define some configs:
```python
# mcp_server.py
LANCEDB_URI = "~/lancedb"
TABLE_NAME = "mcp_data"
EMBEDDING_FUNCTION = "sentence-transformers"
MODEL_NAME = "all-MiniLM-L6-v2"
```
Then initialize the table that we'll use to store and retrieve data:
```python
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
model = get_registry().get(EMBEDDING_FUNCTION).create(model_name=MODEL_NAME)
class Schema(LanceModel):
text: str = model.SourceField()
vector: Vector(model.ndims()) = model.VectorField()
db = lancedb.connect(LANCEDB_URI)
if TABLE_NAME not in db.table_names():
db.create_table(TABLE_NAME, schema=Schema)
```
!!! Note "Using LanceDB cloud"
If you want to use LanceDB cloud, you'll need to set the uri to your remote table
instance and also provide a token. Every other functionality will remain the same
## Defining the tools
Tools let LLMs take actions through your server. There are other components like `resources` that allow you to expose certain data sources to LLMs. For our use case, we need to define tools that LLMs can call in order to ingest or retrieve data.
We'll use `FastMCP` interface of the MCP package. The FastMCP server is your core interface to the MCP protocol. It handles connection management, protocol compliance, and message routing.
```python
from mcp.server.fastmcp import FastMCP
mcp = FastMCP("lancedb-example")
```
### Add data ingestion tool
This function takes a string as input and adds it to the LanceDB table.
```python
@mcp.tool()
async def ingest_data(content: str) -> str:
"""
Add a new memory to the vector database
Args:
content: Content of the memory
"""
tbl = db[TABLE_NAME]
tbl.add([
{"text": content}
])
return f"Added memory: {content}"
```
### Retrieve data tool
```python
@mcp.tool()
async def retrieve_data(query: str, limit: int = 5) -> str:
"""
Search db using vector search
Args:
query: The search query
limit: Maximum number of results to return
"""
tbl = db[TABLE_NAME]
rs = tbl.search(query).limit(limit).to_list()
data = [
r["text"] for r in rs
]
if not data:
return "No relevant data found."
return "\n\n".join(data)
```
This function takes a string and limit as input and searches the LanceDB table for the most relevant memories.
## Install it on Claude desktop
To install this MCP, you can simply run this command and it'll be registered on your Claude desktop
```
mcp install mcp_server.py
```
You'll see logs similar to this:
```
[04/07/25 20:18:08] INFO Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5 SentenceTransformer.py:218
Batches: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4.06it/s]
[04/07/25 20:18:11] INFO Added server 'lancedb' to Claude config claude.py:129
INFO Successfully installed lancedb in Claude app cli.py:467
```
Now simply fire up claude desktop and you can start using it.
1. If installed correctly, you'll see `lancedb` in the MCP apps list
![Screenshot 2025-04-08 at 8 07 39 AM](https://github.com/user-attachments/assets/6dede8ae-7e39-4931-ae60-b57ce620b328)
2. You can now use the `ingest_data` tool to add data to the table. To do that, you can simply ask claude using natural language
![Screenshot 2025-04-08 at 8 10 37 AM](https://github.com/user-attachments/assets/0cd4df4e-98bb-4bf1-8566-1671eb310a1d)
3. Now you can start asking questions using the `retrieve_data` tool. It'll automatically search the table for relevant data. You should see something like this
![Screenshot 2025-04-08 at 8 11 49 AM](https://github.com/user-attachments/assets/71b5b232-601c-4864-9d52-9b84f16adad9)
4. Claude tries to set the params for tool calling on its own but you can also specify the details.
![Screenshot 2025-04-08 at 8 12 30 AM](https://github.com/user-attachments/assets/5f362bd1-b2fc-4145-8f1e-968d453bf615)
## Community examples
- Find a minimal LanceDB mcp server similar to this [here](https://github.com/kyryl-opens-ml/mcp-server-lancedb/blob/main/src/mcp_lance_db/server.py)
- You can find an implementation of a more complex MCP server that uses LanceDB to implement an advanced CodeQA feature [here](https://github.com/lancedb/MCPExample).

View File

@@ -1304,6 +1304,27 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(results[0].text).toBe(data[0].text);
});
// Builds an FTS index over a column whose values are lists of strings, then
// checks a single-term query and a quoted phrase query against it.
test("full text index on list", async () => {
  const db = await connect(tmpDir.name);

  // "lance" appears in the first three rows; the exact string
  // "lance database" appears in the first two.
  const rows = [
    { text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
    { text: ["lance database"], vector: [0.4, 0.5, 0.6] },
    { text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
    { text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
    { text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
  ];

  const table = await db.createTable("test", rows);
  await table.createIndex("text", { config: Index.fts() });

  // Single-term query.
  const termHits = await table.search("lance").toArray();
  expect(termHits.length).toBe(3);

  // Quoted query string.
  const phraseHits = await table.search('"lance database"').toArray();
  expect(phraseHits.length).toBe(2);
});
test("full text search without positions", async () => {
const db = await connect(tmpDir.name);
const data = [

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.22.0-beta.5"
current_version = "0.22.0-beta.6"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.22.0-beta.5"
version = "0.22.0-beta.6"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -22,6 +22,7 @@ from lancedb.db import DBConnection
from lancedb.index import FTS
from lancedb.query import BoostQuery, MatchQuery, MultiMatchQuery, PhraseQuery
import numpy as np
import pyarrow as pa
import pandas as pd
import pytest
from utils import exception_output
@@ -626,3 +627,32 @@ def test_language(mem_db: DBConnection):
# Stop words -> no results
results = table.search("la", query_type="fts").limit(5).to_list()
assert len(results) == 0
def test_fts_on_list(mem_db: DBConnection):
    """Check FTS indexing and querying on a column holding lists of strings.

    Creates a five-row table, builds an FTS index on ``text`` with
    ``use_tantivy=False``, then runs a plain term query and a ``PhraseQuery``.
    """
    # "lance" appears in the first three rows; the exact string
    # "lance database" appears in the first two.
    documents = [
        ["lance database", "the", "search"],
        ["lance database"],
        ["lance", "search"],
        ["database", "search"],
        ["unrelated", "doc"],
    ]
    vectors = [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
        [7.0, 8.0, 9.0],
        [10.0, 11.0, 12.0],
        [13.0, 14.0, 15.0],
    ]
    data = pa.table({"text": documents, "vector": vectors})

    table = mem_db.create_table("test", data=data)
    table.create_fts_index("text", use_tantivy=False)

    term_hits = table.search("lance").limit(5).to_list()
    assert len(term_hits) == 3

    phrase_query = PhraseQuery("lance database", "text")
    phrase_hits = table.search(phrase_query).limit(5).to_list()
    assert len(phrase_hits) == 2

View File

@@ -158,7 +158,17 @@ pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
}
/// Reports whether `dtype` is accepted as the column type for an FTS index.
///
/// NOTE(review): this span comes from a diff view and shows two bodies back to back.
/// The `matches!` line accepts only `Utf8` / `LargeUtf8`; the line after it delegates to
/// `supported_fts_data_type_impl` with `in_list = false`. Presumably the `matches!` line
/// is the one being replaced -- confirm against the actual file.
pub fn supported_fts_data_type(dtype: &DataType) -> bool {
matches!(dtype, DataType::Utf8 | DataType::LargeUtf8)
supported_fts_data_type_impl(dtype, false)
}
/// Recursive check behind `supported_fts_data_type`.
///
/// * `dtype` - the type being examined.
/// * `in_list` - `true` when `dtype` is the element type of a list that was already
///   unwrapped by an outer call.
///
/// `Utf8` and `LargeUtf8` are accepted at either level. `List` and `LargeList` are
/// accepted only at the outer level (`in_list == false`), and only when their element
/// type is itself accepted; a list nested inside another list is therefore rejected.
/// Every other type is rejected.
fn supported_fts_data_type_impl(dtype: &DataType, in_list: bool) -> bool {
    match dtype {
        DataType::Utf8 | DataType::LargeUtf8 => true,
        // Unwrap at most one list level: the recursive call runs with `in_list = true`,
        // so this guarded arm cannot match a second time.
        DataType::List(element) | DataType::LargeList(element) if !in_list => {
            supported_fts_data_type_impl(element.data_type(), true)
        }
        _ => false,
    }
}
pub fn supported_vector_data_type(dtype: &DataType) -> bool {