Compare commits

...

11 Commits

Author SHA1 Message Date
Lance Release
e291212ecf Bump version: 0.15.0-beta.0 → 0.15.0 2024-10-29 22:16:05 +00:00
Lance Release
edc6445f6f Bump version: 0.14.1-beta.1 → 0.15.0-beta.0 2024-10-29 22:16:05 +00:00
Will Jones
a324f4ad7a feat(node): enable logging and show full errors (#1775)
This exposes the `LANCEDB_LOG` environment variable in node, so that
users can now turn on logging.

In addition, fixes a bug where only the top-level error from Rust was
being shown. This PR makes sure the full error chain is included in the
error message. In the future, will improve this so the error chain is
set on the [cause](https://nodejs.org/api/errors.html#errorcause)
property of JS errors https://github.com/lancedb/lancedb/issues/1779

Fixes #1774
2024-10-29 15:13:34 -07:00
Weston Pace
55104c5bae feat: allow distance type (metric) to be specified during hybrid search (#1777) 2024-10-29 13:51:18 -07:00
Rithik Kumar
d71df4572e docs: revamp langchain integration page (#1773)
Before - 
<img width="1030" alt="Screenshot 2024-10-28 132932"
src="https://github.com/user-attachments/assets/63f78bfa-949e-473e-ab22-0c692577fa3e">


After - 
<img width="1037" alt="Screenshot 2024-10-28 132727"
src="https://github.com/user-attachments/assets/85a12f6c-74f0-49ba-9f1a-fe77ad125704">
2024-10-29 22:55:50 +05:30
Rithik Kumar
aa269199ad docs: fix archived examples links (#1751) 2024-10-29 22:55:27 +05:30
BubbleCal
32fdcf97db feat!: upgrade lance to 0.19.1 (#1762)
BREAKING CHANGE: default tokenizer no longer does stemming or stop-word
removal. Users should explicitly turn that option on in the future.

- upgrade lance to 0.19.1
- update the FTS docs
- update the FTS API

Upstream change notes:
https://github.com/lancedb/lance/releases/tag/v0.19.1

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
2024-10-29 09:03:52 -07:00
Ryan Green
b9802a0d23 Revert "fix: error during deserialization of "INVERTED" index type"
This reverts commit 2ea5939f85.
2024-10-25 14:46:47 -02:30
Ryan Green
2ea5939f85 fix: error during deserialization of "INVERTED" index type 2024-10-25 14:40:14 -02:30
Lance Release
04e1f1ee4c Updating package-lock.json 2024-10-23 00:34:22 +00:00
Lance Release
bbc588e27d Bump version: 0.11.1-beta.0 → 0.11.1-beta.1 2024-10-23 00:34:01 +00:00
50 changed files with 870 additions and 376 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.11.1-beta.0"
current_version = "0.11.1-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -18,17 +18,16 @@ repository = "https://github.com/lancedb/lancedb"
description = "Serverless, low-latency vector database for AI applications"
keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
[workspace.dependencies]
lance = { "version" = "=0.18.3", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-index = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-linalg = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-table = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-testing = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-datafusion = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance-encoding = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
lance = { "version" = "=0.19.1", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.19.1" }
lance-linalg = { "version" = "=0.19.1" }
lance-table = { "version" = "=0.19.1" }
lance-testing = { "version" = "=0.19.1" }
lance-datafusion = { "version" = "=0.19.1" }
lance-encoding = { "version" = "=0.19.1" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"
@@ -42,6 +41,7 @@ async-trait = "0"
chrono = "0.4.35"
datafusion-common = "41.0"
datafusion-physical-plan = "41.0"
env_logger = "0.10"
half = { "version" = "=2.4.1", default-features = false, features = [
"num-traits",
] }

View File

@@ -100,7 +100,7 @@ nav:
- 🏃🏼‍♂️ Quick start: basic.md
- 📚 Concepts:
- Vector search: concepts/vector_search.md
- Indexing:
- Indexing:
- IVFPQ: concepts/index_ivfpq.md
- HNSW: concepts/index_hnsw.md
- Storage: concepts/storage.md
@@ -109,7 +109,8 @@ nav:
- Working with tables: guides/tables.md
- Building a vector index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Full-text search (native): fts.md
- Full-text search (tantivy-based): fts_tantivy.md
- Building a scalar index: guides/scalar_index.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
@@ -148,10 +149,10 @@ nav:
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- 🧬 Managing embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models:
- Available models:
- Overview: embeddings/default_embedding_functions.md
- Text Embedding Functions:
- Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
@@ -200,7 +201,7 @@ nav:
- Evaluation: examples/python_examples/evaluations.md
- AI Agent: examples/python_examples/aiagent.md
- Recommender System: examples/python_examples/recommendersystem.md
- Miscellaneous:
- Miscellaneous:
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
@@ -210,9 +211,10 @@ nav:
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- 🦀 Rust:
- Overview: examples/examples_rust.md
- Studies:
- 📓 Studies:
- ↗Improve retrievers with hybrid search and reranking: https://blog.lancedb.com/hybrid-search-and-reranking-report/
- 💭 FAQs: faq.md
- 🔍 Troubleshooting: troubleshooting.md
- ⚙️ API reference:
- 🐍 Python: python/python.md
- 👾 JavaScript (vectordb): javascript/modules.md
@@ -228,7 +230,7 @@ nav:
- Quick start: basic.md
- Concepts:
- Vector search: concepts/vector_search.md
- Indexing:
- Indexing:
- IVFPQ: concepts/index_ivfpq.md
- HNSW: concepts/index_hnsw.md
- Storage: concepts/storage.md
@@ -237,7 +239,8 @@ nav:
- Working with tables: guides/tables.md
- Building an ANN index: ann_indexes.md
- Vector Search: search.md
- Full-text search: fts.md
- Full-text search (native): fts.md
- Full-text search (tantivy-based): fts_tantivy.md
- Building a scalar index: guides/scalar_index.md
- Hybrid search:
- Overview: hybrid_search/hybrid_search.md
@@ -276,10 +279,10 @@ nav:
- Reranking: guides/tuning_retrievers/2_reranking.md
- Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
- Managing Embeddings:
- Understand Embeddings: embeddings/understanding_embeddings.md
- Understand Embeddings: embeddings/understanding_embeddings.md
- Get Started: embeddings/index.md
- Embedding functions: embeddings/embedding_functions.md
- Available models:
- Available models:
- Overview: embeddings/default_embedding_functions.md
- Text Embedding Functions:
- Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
@@ -324,7 +327,7 @@ nav:
- Evaluation: examples/python_examples/evaluations.md
- AI Agent: examples/python_examples/aiagent.md
- Recommender System: examples/python_examples/recommendersystem.md
- Miscellaneous:
- Miscellaneous:
- Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- 👾 JavaScript:
@@ -367,5 +370,4 @@ extra:
- icon: fontawesome/brands/x-twitter
link: https://twitter.com/lancedb
- icon: fontawesome/brands/linkedin
link: https://www.linkedin.com/company/lancedb
link: https://www.linkedin.com/company/lancedb

View File

@@ -36,6 +36,6 @@
[aware_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/chatbot_using_Llama2_&_lanceDB/main.ipynb
[aware_ghost]: https://blog.lancedb.com/context-aware-chatbot-using-llama-2-lancedb-as-vector-database-4d771d95c755
[csv_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Chat_with_csv_file
[csv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Chat_with_csv_file/main.ipynb
[csv_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/Chat_with_csv_file
[csv_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/Chat_with_csv_file/main.ipynb
[csv_ghost]: https://blog.lancedb.com/p/d8c71df4-e55f-479a-819e-cde13354a6a3/

View File

@@ -12,7 +12,7 @@ LanceDB supports multimodal search by indexing and querying vector representatio
|:----------------|:-----------------|:-----------|
| **Multimodal CLIP: DiffusionDB 🌐💥** | Multi-Modal Search with **CLIP** and **LanceDB** Using **DiffusionDB** Data for Combined Text and Image Understanding ! 🔓 | [![GitHub](../../assets/github.svg)][Clip_diffusionDB_github] <br>[![Open In Collab](../../assets/colab.svg)][Clip_diffusionDB_colab] <br>[![Python](../../assets/python.svg)][Clip_diffusionDB_python] <br>[![Ghost](../../assets/ghost.svg)][Clip_diffusionDB_ghost] |
| **Multimodal CLIP: Youtube Videos 📹👀** | Search **Youtube videos** using Multimodal CLIP, finding relevant content with ease and accuracy! 🎯 | [![Github](../../assets/github.svg)][Clip_youtube_github] <br>[![Open In Collab](../../assets/colab.svg)][Clip_youtube_colab] <br> [![Python](../../assets/python.svg)][Clip_youtube_python] <br>[![Ghost](../../assets/ghost.svg)][Clip_youtube_python] |
| **Multimodal Image + Text Search 📸🔍** | Find **relevant documents** and **images** with a single query using **LanceDB's** multimodal search capabilities, to seamlessly integrate text and visuals ! 🌉 | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search) <br>[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.ipynb) <br> [![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.py)<br> [![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) |
| **Multimodal Image + Text Search 📸🔍** | Find **relevant documents** and **images** with a single query using **LanceDB's** multimodal search capabilities, to seamlessly integrate text and visuals ! 🌉 | [![GitHub](../../assets/github.svg)](https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/multimodal_search) <br>[![Open In Collab](../../assets/colab.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multimodal_search/main.ipynb) <br> [![Python](../../assets/python.svg)](https://github.com/lancedb/vectordb-recipes/blob/main/examples/multimodal_search/main.py)<br> [![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/multi-modal-ai-made-easy-with-lancedb-clip-5aaf8801c939/) |
| **Cambrian-1: Vision-Centric Image Exploration 🔍👀** | Learn how **Cambrian-1** works, using an example of **Vision-Centric** exploration on images found through vector search ! Work on **Flickr-8k** dataset 🔎 | [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/prasantdixit/cambrian-1-vision-centric-exploration-of-images/)<br> [![Ghost](../../assets/ghost.svg)](https://blog.lancedb.com/cambrian-1-vision-centric-exploration/) |

View File

@@ -70,12 +70,12 @@ Build RAG (Retrieval-Augmented Generation) with LanceDB, a powerful solution fo
[flare_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR/main.ipynb
[flare_ghost]: https://blog.lancedb.com/better-rag-with-active-retrieval-augmented-generation-flare-3b66646e2a9f/
[query_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/QueryExpansion&Reranker
[query_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/QueryExpansion&Reranker/main.ipynb
[query_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/QueryExpansion%26Reranker
[query_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/QueryExpansion&Reranker/main.ipynb
[fusion_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/RAG_Fusion
[fusion_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/RAG_Fusion/main.ipynb
[fusion_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/RAG_Fusion
[fusion_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/RAG_Fusion/main.ipynb
[agentic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG
[agentic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Agentic_RAG/main.ipynb

View File

@@ -19,8 +19,8 @@ Deliver personalized experiences with Recommender Systems. 🎁
[movie_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommender/main.py
[genre_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/movie-recommendation-with-genres
[genre_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/movie-recommendation-with-genres/movie_recommendation_with_doc2vec_and_lancedb.ipynb
[genre_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/movie-recommendation-with-genres
[genre_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/movie-recommendation-with-genres/movie_recommendation_with_doc2vec_and_lancedb.ipynb
[genre_ghost]: https://blog.lancedb.com/movie-recommendation-system-using-lancedb-and-doc2vec/
[product_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/product-recommender
@@ -33,5 +33,5 @@ Deliver personalized experiences with Recommender Systems. 🎁
[arxiv_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/arxiv-recommender/main.py
[food_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Food_recommendation
[food_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Food_recommendation/main.ipynb
[food_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/Food_recommendation
[food_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/Food_recommendation/main.ipynb

View File

@@ -37,16 +37,16 @@ LanceDB implements vector search algorithms for efficient document retrieval and
[NER_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/NER-powered-Semantic-Search/NER_powered_Semantic_Search_with_LanceDB.ipynb
[NER_ghost]: https://blog.lancedb.com/ner-powered-semantic-search-using-lancedb-51051dc3e493
[audio_search_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/audio_search
[audio_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/audio_search/main.ipynb
[audio_search_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/audio_search/main.py
[audio_search_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/audio_search
[audio_search_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/audio_search/main.ipynb
[audio_search_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/archived_examples/audio_search/main.py
[mls_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa
[mls_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa/main.ipynb
[mls_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/multi-lingual-wiki-qa/main.py
[mls_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/multi-lingual-wiki-qa
[mls_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multi-lingual-wiki-qa/main.ipynb
[mls_python]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/archived_examples/multi-lingual-wiki-qa/main.py
[fr_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/facial_recognition
[fr_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/facial_recognition/main.ipynb
[fr_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/facial_recognition
[fr_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/facial_recognition/main.ipynb
[sentiment_analysis_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews
[sentiment_analysis_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Sentiment-Analysis-Analyse-Hotel-Reviews/Sentiment_Analysis_using_LanceDB.ipynb
@@ -70,8 +70,8 @@ LanceDB implements vector search algorithms for efficient document retrieval and
[openvino_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb
[openvino_ghost]: https://blog.lancedb.com/accelerate-vector-search-applications-using-openvino-lancedb/
[zsic_github]: https://github.com/lancedb/vectordb-recipes/blob/main/examples/zero-shot-image-classification
[zsic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/zero-shot-image-classification/main.ipynb
[zsic_github]: https://github.com/lancedb/vectordb-recipes/tree/main/examples/archived_examples/zero-shot-image-classification
[zsic_colab]: https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/archived_examples/zero-shot-image-classification/main.ipynb
[zsic_ghost]: https://blog.lancedb.com/zero-shot-image-classification-with-vector-search/

View File

@@ -1,21 +1,9 @@
# Full-text search
# Full-text search (Native FTS)
LanceDB provides support for full-text search via Lance (before via [Tantivy](https://github.com/quickwit-oss/tantivy) (Python only)), allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
Currently, the Lance full text search is missing some features that are in the Tantivy full text search. This includes query parser and customizing the tokenizer. Thus, in Python, Tantivy is still the default way to do full text search and many of the instructions below apply just to Tantivy-based indices.
## Installation (Only for Tantivy-based FTS)
LanceDB provides support for full-text search via Lance, allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
!!! note
No need to install the tantivy dependency if using native FTS
To use full-text search, install the dependency [`tantivy-py`](https://github.com/quickwit-oss/tantivy-py):
```sh
# Say you want to use tantivy==0.20.1
pip install tantivy==0.20.1
```
The Python SDK uses tantivy-based FTS by default; pass `use_tantivy=False` to use native FTS.
## Example
@@ -39,7 +27,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
# passing `use_tantivy=False` to use lance FTS index
# `use_tantivy=True` by default
table.create_fts_index("text")
table.create_fts_index("text", use_tantivy=False)
table.search("puppy").limit(10).select(["text"]).to_list()
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
# ...
@@ -93,51 +81,40 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
```
It would search on all indexed columns by default, so it's useful when there are multiple indexed columns.
For now, this is supported in tantivy way only.
Passing `fts_columns="text"` if you want to specify the columns to search, but it's not available for Tantivy-based full text search.
Pass `fts_columns="text"` if you want to specify the columns to search.
!!! note
LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
## Tokenization
By default the text is tokenized by splitting on punctuation and whitespaces and then removing tokens that are longer than 40 chars. For more language specific tokenization then provide the argument tokenizer_name with the 2 letter language code followed by "_stem". So for english it would be "en_stem".
By default, the text is tokenized by splitting on punctuation and whitespace; tokens longer than 40 characters are filtered out, and all words are lowercased.
For now, only the Tantivy-based FTS index supports to specify the tokenizer, so it's only available in Python with `use_tantivy=True`.
Stemming is useful for improving search results by reducing words to their root form, e.g. "running" to "run". LanceDB supports stemming for multiple languages; you can enable it by specifying the tokenizer name with the pattern `tokenizer_name="{language_code}_stem"`, e.g. `en_stem` for English.
=== "use_tantivy=True"
```python
table.create_fts_index("text", use_tantivy=True, tokenizer_name="en_stem")
```
=== "use_tantivy=False"
[**Not supported yet**](https://github.com/lancedb/lance/issues/1195)
For example, to enable stemming for English:
```python
table.create_fts_index("text", use_tantivy=True, tokenizer_name="en_stem")
```
The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
## Index multiple columns
The tokenizer is customizable: you can specify how the tokenizer splits the text, how it filters out words, etc.
If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
=== "use_tantivy=True"
```python
table.create_fts_index(["text1", "text2"])
```
=== "use_tantivy=False"
[**Not supported yet**](https://github.com/lancedb/lance/issues/1195)
Note that the search API call does not change - you can search over all indexed columns at once.
For example, for languages with accents, you can configure the tokenizer to use `ascii_folding` to remove accents, e.g. 'é' to 'e':
```python
table.create_fts_index("text",
use_tantivy=False,
language="French",
stem=True,
ascii_folding=True)
```
## Filtering
Currently the LanceDB full text search feature supports *post-filtering*, meaning filters are
applied on top of the full text search results. This can be invoked via the familiar
`where` syntax:
LanceDB full-text search supports filtering the search results by a condition; both pre-filtering and post-filtering are supported.
This can be invoked via the familiar `where` syntax:
=== "Python"
@@ -169,98 +146,17 @@ applied on top of the full text search results. This can be invoked via the fami
.await?;
```
## Sorting
!!! warning "Warn"
Sorting is available for only Tantivy-based FTS
You can pre-sort the documents by specifying `ordering_field_names` when
creating the full-text search index. Once pre-sorted, you can then specify
`ordering_field_name` while searching to return results sorted by the given
field. For example,
```python
table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=["sort_by_field"])
(table.search("terms", ordering_field_name="sort_by_field")
.limit(20)
.to_list())
```
!!! note
If you wish to specify an ordering field at query time, you must also
have specified it during indexing time. Otherwise at query time, an
error will be raised that looks like `ValueError: The field does not exist: xxx`
!!! note
The fields to sort on must be of typed unsigned integer, or else you will see
an error during indexing that looks like
`TypeError: argument 'value': 'float' object cannot be interpreted as an integer`.
!!! note
You can specify multiple fields for ordering at indexing time.
But at query time only one ordering field is supported.
## Phrase queries vs. terms queries
!!! warning "Warn"
Lance-based FTS doesn't support queries using boolean operators `OR`, `AND`.
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
or a **terms** search query like `old man sea`. For more details on the terms
query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
!!! tip "Note"
The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`.
```py
# This raises a syntax error
table.search("they could have been dogs OR cats")
```
On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and
the query is treated as a phrase query.
```py
# This works!
table.search("they could have been dogs or cats")
```
It can be cumbersome to have to remember what will cause a syntax error depending on the type of
query you want to perform. To make this simpler, when you want to perform a phrase query, you can
enforce it in one of two ways:
1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as
a phrase query.
1. Explicitly declare the `phrase_query()` method. This is useful when you have a phrase query that
itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()`
is treated as a phrase query.
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
double quotes replaced by single quotes.
## Configurations (Only for Tantivy-based FTS)
By default, LanceDB configures a 1GB heap size limit for creating the index. You can
reduce this if running on a smaller node, or increase this for faster performance while
indexing a larger corpus.
To search for a phrase, the index must be created with `with_position=True`:
```python
# configure a 512MB heap size
heap = 1024 * 1024 * 512
table.create_fts_index(["text1", "text2"], writer_heap_size=heap, replace=True)
table.create_fts_index("text", use_tantivy=False, with_position=True)
```
## Current limitations
For that Tantivy-based FTS:
1. Currently we do not yet support incremental writes.
If you add data after FTS index creation, it won't be reflected
in search results until you do a full reindex.
2. We currently only support local filesystem paths for the FTS index.
This is a tantivy limitation. We've implemented an object store plugin
but there's no way in tantivy-py to specify to use it.
This will allow you to search for phrases, but it will also significantly increase the index size and indexing time.

162
docs/src/fts_tantivy.md Normal file
View File

@@ -0,0 +1,162 @@
# Full-text search (Tantivy-based FTS)
LanceDB also provides support for full-text search via [Tantivy](https://github.com/quickwit-oss/tantivy), allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
The tantivy-based FTS is only available in Python and does not support building indexes on object storage or incremental indexing. If you need these features, try [native FTS](fts.md).
## Installation
To use full-text search, install the dependency [`tantivy-py`](https://github.com/quickwit-oss/tantivy-py):
```sh
# Say you want to use tantivy==0.20.1
pip install tantivy==0.20.1
```
## Example
Consider that we have a LanceDB table named `my_table`, whose string column `content` we want to index and query via keyword search. The FTS index must be created before you can search via keywords.
```python
import lancedb
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table(
"my_table",
data=[
{"id": 1, "vector": [3.1, 4.1], "title": "happy puppy", "content": "Frodo was a happy puppy", "meta": "foo"},
{"id": 2, "vector": [5.9, 26.5], "title": "playing kittens", "content": "There are several kittens playing around the puppy", "meta": "bar"},
],
)
# passing `use_tantivy=False` to use lance FTS index
# `use_tantivy=True` by default
table.create_fts_index("content", use_tantivy=True)
table.search("puppy").limit(10).select(["content"]).to_list()
# [{'content': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
# ...
```
It would search on all indexed columns by default, so it's useful when there are multiple indexed columns.
!!! note
LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
## Tokenization
By default, the text is tokenized by splitting on punctuation and whitespace, and then removing tokens that are longer than 40 characters. For more language-specific tokenization, provide the argument `tokenizer_name` with the 2-letter language code followed by "_stem". So for English it would be "en_stem".
```python
table.create_fts_index("content", use_tantivy=True, tokenizer_name="en_stem", replace=True)
```
The following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
## Index multiple columns
If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
```python
table.create_fts_index(["title", "content"], use_tantivy=True, replace=True)
```
Note that the search API call does not change - you can search over all indexed columns at once.
## Filtering
Currently the LanceDB full text search feature supports *post-filtering*, meaning filters are
applied on top of the full text search results (see [native FTS](fts.md) if you need pre-filtering). This can be invoked via the familiar
`where` syntax:
```python
table.search("puppy").limit(10).where("meta='foo'").to_list()
```
## Sorting
You can pre-sort the documents by specifying `ordering_field_names` when
creating the full-text search index. Once pre-sorted, you can then specify
`ordering_field_name` while searching to return results sorted by the given
field. For example,
```python
table.create_fts_index(["content"], use_tantivy=True, ordering_field_names=["id"], replace=True)
(table.search("puppy", ordering_field_name="id")
.limit(20)
.to_list())
```
!!! note
If you wish to specify an ordering field at query time, you must also
have specified it during indexing time. Otherwise at query time, an
error will be raised that looks like `ValueError: The field does not exist: xxx`
!!! note
The fields to sort on must be of unsigned integer type, or else you will see
an error during indexing that looks like
`TypeError: argument 'value': 'float' object cannot be interpreted as an integer`.
!!! note
You can specify multiple fields for ordering at indexing time.
But at query time only one ordering field is supported.
## Phrase queries vs. terms queries
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
!!! tip "Note"
The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`.
```py
# This raises a syntax error
table.search("they could have been dogs OR cats")
```
On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and
the query is treated as a phrase query.
```py
# This works!
table.search("they could have been dogs or cats")
```
It can be cumbersome to have to remember what will cause a syntax error depending on the type of
query you want to perform. To make this simpler, when you want to perform a phrase query, you can
enforce it in one of two ways:
1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as
a phrase query.
1. Explicitly declare the `phrase_query()` method. This is useful when you have a phrase query that
itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()`
is treated as a phrase query.
In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
double quotes replaced by single quotes.
## Configurations
By default, LanceDB configures a 1GB heap size limit for creating the index. You can
reduce this if running on a smaller node, or increase this for faster performance while
indexing a larger corpus.
```python
# configure a 512MB heap size
heap = 1024 * 1024 * 512
table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=heap, replace=True)
```
## Current limitations
1. Currently we do not yet support incremental writes.
If you add data after FTS index creation, it won't be reflected
in search results until you do a full reindex.
2. We currently only support local filesystem paths for the FTS index.
This is a tantivy limitation. We've implemented an object store plugin
but there's no way in tantivy-py to specify to use it.

View File

@@ -49,7 +49,8 @@ The following pages go deeper into the internal of LanceDB and how to use it.
* [Working with tables](guides/tables.md): Learn how to work with tables and their associated functions
* [Indexing](ann_indexes.md): Understand how to create indexes
* [Vector search](search.md): Learn how to perform vector similarity search
* [Full-text search](fts.md): Learn how to perform full-text search
* [Full-text search (native)](fts.md): Learn how to perform full-text search
* [Full-text search (tantivy-based)](fts_tantivy.md): Learn how to perform full-text search using Tantivy
* [Managing embeddings](embeddings/index.md): Managing embeddings and the embedding functions API in LanceDB
* [Ecosystem Integrations](integrations/index.md): Integrate LanceDB with other tools in the data ecosystem
* [Python API Reference](python/python.md): Python OSS and Cloud API references

View File

@@ -1,5 +1,10 @@
# Langchain
![Illustration](../assets/langchain.png)
**LangChain** is a framework designed for building applications with large language models (LLMs) by chaining together various components. It supports a range of functionalities including memory, agents, and chat models, enabling developers to create context-aware applications.
![Illustration](https://raw.githubusercontent.com/lancedb/assets/refs/heads/main/docs/assets/integration/langchain_rag.png)
LangChain streamlines these stages (in figure above) by providing pre-built components and tools for integration, memory management, and deployment, allowing developers to focus on application logic rather than underlying complexities.
Integration of **Langchain** with **LanceDB** enables applications to retrieve the most relevant data by comparing query vectors against stored vectors, facilitating effective information retrieval. It results in better and context aware replies and actions by the LLMs.
## Quick Start
You can load your document data using LangChain's loaders. For this example we use `TextLoader`, with `OpenAIEmbeddings` as the embedding model. Check out the complete example here - [LangChain demo](../notebooks/langchain_example.ipynb)
@@ -26,20 +31,28 @@ print(docs[0].page_content)
## Documentation
In the above example `LanceDB` vector store class object is created using `from_documents()` method which is a `classmethod` and returns the initialized class object.
You can also use `LanceDB.from_texts(texts: List[str],embedding: Embeddings)` class method.
The exhaustive list of parameters for `LanceDB` vector store are :
- `connection`: (Optional) `lancedb.db.LanceDBConnection` connection object to use. If not provided, a new connection will be created.
- `embedding`: Langchain embedding model.
- `vector_key`: (Optional) Column name to use for vector's in the table. Defaults to `'vector'`.
- `id_key`: (Optional) Column name to use for id's in the table. Defaults to `'id'`.
- `text_key`: (Optional) Column name to use for text in the table. Defaults to `'text'`.
- `table_name`: (Optional) Name of your table in the database. Defaults to `'vectorstore'`.
- `api_key`: (Optional) API key to use for LanceDB cloud database. Defaults to `None`.
- `region`: (Optional) Region to use for LanceDB cloud database. Only for LanceDB Cloud, defaults to `None`.
- `mode`: (Optional) Mode to use for adding data to the table. Defaults to `'overwrite'`.
- `reranker`: (Optional) The reranker to use for LanceDB.
- `relevance_score_fn`: (Optional[Callable[[float], float]]) Langchain relevance score function to be used. Defaults to `None`.
The exhaustive list of parameters for `LanceDB` vector store are :
|Name|type|Purpose|default|
|:----|:----|:----|:----|
|`connection`| (Optional) `Any` |`lancedb.db.LanceDBConnection` connection object to use. If not provided, a new connection will be created.|`None`|
|`embedding`| (Optional) `Embeddings` | Langchain embedding model.|Provided by user.|
|`uri`| (Optional) `str` |It specifies the directory location of **LanceDB database** and establishes a connection that can be used to interact with the database. |`/tmp/lancedb`|
|`vector_key` |(Optional) `str`| Column name to use for vector's in the table.|`'vector'`|
|`id_key` |(Optional) `str`| Column name to use for id's in the table.|`'id'`|
|`text_key` |(Optional) `str` |Column name to use for text in the table.|`'text'`|
|`table_name` |(Optional) `str`| Name of your table in the database.|`'vectorstore'`|
|`api_key` |(Optional) `str` |API key to use for LanceDB cloud database.|`None`|
|`region` |(Optional) `str`| Region to use for LanceDB cloud database.|Only for LanceDB Cloud : `None`.|
|`mode` |(Optional) `str` |Mode to use for adding data to the table. Valid values are "append" and "overwrite".|`'overwrite'`|
|`table`| (Optional) `Any`|You can connect to an existing table of LanceDB, created outside of langchain, and utilize it.|`None`|
|`distance`|(Optional) `str`|The choice of distance metric used to calculate the similarity between vectors.|`'l2'`|
|`reranker` |(Optional) `Any`|The reranker to use for LanceDB.|`None`|
|`relevance_score_fn` |(Optional) `Callable[[float], float]` | Langchain relevance score function to be used.|`None`|
|`limit`|`int`|Set the maximum number of results to return.|`DEFAULT_K` (it is 4)|
```python
db_url = "db://lang_test" # url of db you created
@@ -51,19 +64,24 @@ vector_store = LanceDB(
api_key=api_key, #(dont include for local API)
region=region, #(dont include for local API)
embedding=embeddings,
table_name='langchain_test' #Optional
table_name='langchain_test' # Optional
)
```
### Methods
##### add_texts()
- `texts`: `Iterable` of strings to add to the vectorstore.
- `metadatas`: Optional `list[dict()]` of metadatas associated with the texts.
- `ids`: Optional `list` of ids to associate with the texts.
- `kwargs`: `Any`
This method adds texts and stores respective embeddings automatically.
This method turns texts into embeddings and adds them to the database.
|Name|Purpose|defaults|
|:---|:---|:---|
|`texts`|`Iterable` of strings to add to the vectorstore.|Provided by user|
|`metadatas`|Optional `list[dict()]` of metadatas associated with the texts.|`None`|
|`ids`|Optional `list` of ids to associate with the texts.|`None`|
|`kwargs`| Other keyworded arguments provided by the user. |-|
It returns list of ids of the added texts.
```python
vector_store.add_texts(texts = ['test_123'], metadatas =[{'source' :'wiki'}])
@@ -78,14 +96,25 @@ pd_df.to_csv("docsearch.csv", index=False)
# you can also create a new vector store object using an older connection object:
vector_store = LanceDB(connection=tbl, embedding=embeddings)
```
##### create_index()
- `col_name`: `Optional[str] = None`
- `vector_col`: `Optional[str] = None`
- `num_partitions`: `Optional[int] = 256`
- `num_sub_vectors`: `Optional[int] = 96`
- `index_cache_size`: `Optional[int] = None`
This method creates an index for the vector store. For index creation make sure your table has enough data in it. An ANN index is ususally not needed for datasets ~100K vectors. For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index.
------
##### create_index()
This method creates a scalar(for non-vector cols) or a vector index on a table.
|Name|type|Purpose|defaults|
|:---|:---|:---|:---|
|`vector_col`|`Optional[str]`| Provide if you want to create index on a vector column. |`None`|
|`col_name`|`Optional[str]`| Provide if you want to create index on a non-vector column. |`None`|
|`metric`|`Optional[str]` |Provide the metric to use for vector index. choice of metrics: 'L2', 'dot', 'cosine'. |`L2`|
|`num_partitions`|`Optional[int]`|Number of partitions to use for the index.|`256`|
|`num_sub_vectors`|`Optional[int]` |Number of sub-vectors to use for the index.|`96`|
|`index_cache_size`|`Optional[int]` |Size of the index cache.|`None`|
|`name`|`Optional[str]` |Name of the table to create index on.|`None`|
For index creation make sure your table has enough data in it. An ANN index is usually not needed for datasets of ~100K vectors. For large-scale (>1M) or higher dimension vectors, it is beneficial to create an ANN index.
```python
# for creating vector index
@@ -96,42 +125,63 @@ vector_store.create_index(col_name='text')
```
##### similarity_search()
- `query`: `str`
- `k`: `Optional[int] = None`
- `filter`: `Optional[Dict[str, str]] = None`
- `fts`: `Optional[bool] = False`
- `name`: `Optional[str] = None`
- `kwargs`: `Any`
------
Return documents most similar to the query without relevance scores
##### similarity_search()
This method performs similarity search based on **text query**.
| Name | Type | Purpose | Default |
|---------|----------------------|---------|---------|
| `query` | `str` | A `str` representing the text query that you want to search for in the vector store. | N/A |
| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` |
| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` |
| `fts` | `Optional[bool]` | It indicates whether to perform a full-text search (FTS). | `False` |
| `name` | `Optional[str]` | It is used for specifying the name of the table to query. If not provided, it uses the default table set during the initialization of the LanceDB instance. | `None` |
| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A |
Return documents most similar to the query **without relevance scores**.
```python
docs = docsearch.similarity_search(query)
print(docs[0].page_content)
```
##### similarity_search_by_vector()
- `embedding`: `List[float]`
- `k`: `Optional[int] = None`
- `filter`: `Optional[Dict[str, str]] = None`
- `name`: `Optional[str] = None`
- `kwargs`: `Any`
------
Returns documents most similar to the query vector.
##### similarity_search_by_vector()
The method returns documents that are most similar to the specified **embedding (query) vector**.
| Name | Type | Purpose | Default |
|-------------|---------------------------|---------|---------|
| `embedding` | `List[float]` | The embedding vector you want to use to search for similar documents in the vector store. | N/A |
| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` |
| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` |
| `name` | `Optional[str]` | It is used for specifying the name of the table to query. If not provided, it uses the default table set during the initialization of the LanceDB instance. | `None` |
| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A |
**It does not provide relevance scores.**
```python
docs = docsearch.similarity_search_by_vector(query)
print(docs[0].page_content)
```
##### similarity_search_with_score()
- `query`: `str`
- `k`: `Optional[int] = None`
- `filter`: `Optional[Dict[str, str]] = None`
- `kwargs`: `Any`
------
Returns documents most similar to the query string with relevance scores, gets called by base class's `similarity_search_with_relevance_scores` which selects relevance score based on our `_select_relevance_score_fn`.
##### similarity_search_with_score()
Returns documents most similar to the **query string** along with their relevance scores.
| Name | Type | Purpose | Default |
|----------|---------------------------|---------|---------|
| `query` | `str` |A `str` representing the text query you want to search for in the vector store. This query will be converted into an embedding using the specified embedding function. | N/A |
| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` |
| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. This allows you to narrow down the search results based on certain metadata attributes associated with the documents. | `None` |
| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A |
It gets called by base class's `similarity_search_with_relevance_scores` which selects relevance score based on our `_select_relevance_score_fn`.
```python
docs = docsearch.similarity_search_with_relevance_scores(query)
@@ -139,15 +189,21 @@ print("relevance score - ", docs[0][1])
print("text- ", docs[0][0].page_content[:1000])
```
##### similarity_search_by_vector_with_relevance_scores()
- `embedding`: `List[float]`
- `k`: `Optional[int] = None`
- `filter`: `Optional[Dict[str, str]] = None`
- `name`: `Optional[str] = None`
- `kwargs`: `Any`
------
Return documents most similar to the query vector with relevance scores.
Relevance score
##### similarity_search_by_vector_with_relevance_scores()
Similarity search using **query vector**.
| Name | Type | Purpose | Default |
|-------------|---------------------------|---------|---------|
| `embedding` | `List[float]` | The embedding vector you want to use to search for similar documents in the vector store. | N/A |
| `k` | `Optional[int]` | It specifies the number of documents to return. | `None` |
| `filter` | `Optional[Dict[str, str]]`| It is used to filter the search results by specific metadata criteria. | `None` |
| `name` | `Optional[str]` | It is used for specifying the name of the table to query. | `None` |
| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A |
The method returns documents most similar to the specified embedding (query) vector, along with their relevance scores.
```python
docs = docsearch.similarity_search_by_vector_with_relevance_scores(query_embedding)
@@ -155,20 +211,22 @@ print("relevance score - ", docs[0][1])
print("text- ", docs[0][0].page_content[:1000])
```
##### max_marginal_relevance_search()
- `query`: `str`
- `k`: `Optional[int] = None`
- `fetch_k` : Number of Documents to fetch to pass to MMR algorithm, `Optional[int] = None`
- `lambda_mult`: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. `float = 0.5`
- `filter`: `Optional[Dict[str, str]] = None`
- `kwargs`: `Any`
------
Returns docs selected using the maximal marginal relevance(MMR).
##### max_marginal_relevance_search()
This method returns docs selected using the maximal marginal relevance(MMR).
Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.
| Name | Type | Purpose | Default |
|---------------|-----------------|-----------|---------|
| `query` | `str` | Text to look up documents similar to. | N/A |
| `k` | `Optional[int]` | Number of Documents to return.| `4` |
| `fetch_k`| `Optional[int]`| Number of Documents to fetch to pass to MMR algorithm.| `None` |
| `lambda_mult` | `float` | Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. | `0.5` |
| `filter`| `Optional[Dict[str, str]]`| Filter by metadata. | `None` |
| `kwargs` | `Any` | Other keyworded arguments provided by the user. | N/A |
Similarly, the `max_marginal_relevance_search_by_vector()` function returns docs most similar to the embedding passed to the function using MMR. Instead of a string query, you need to pass the embedding to be searched for.
```python
@@ -186,12 +244,19 @@ result_texts = [doc.page_content for doc in result]
print(result_texts)
```
##### add_images()
- `uris` : File path to the image. `List[str]`.
- `metadatas` : Optional list of metadatas. `(Optional[List[dict]], optional)`
- `ids` : Optional list of IDs. `(Optional[List[str]], optional)`
------
Adds images by automatically creating their embeddings and adds them to the vectorstore.
##### add_images()
This method automatically creates embeddings for the images and adds them to the vectorstore.
| Name | Type | Purpose | Default |
|------------|-------------------------------|--------------------------------|---------|
| `uris` | `List[str]` | File path to the image | N/A |
| `metadatas`| `Optional[List[dict]]` | Optional list of metadatas | `None` |
| `ids` | `Optional[List[str]]` | Optional list of IDs | `None` |
It returns list of IDs of the added images.
```python
vec_store.add_images(uris=image_uris)

View File

@@ -45,7 +45,7 @@ Let's see how using LanceDB inside phidata helps in making LLM more useful:
**Install the following packages in the virtual environment**
```python
pip install lancedb phidata youtube_transcript_api openai ollama pandas numpy
pip install lancedb phidata youtube_transcript_api openai ollama numpy pandas
```
**Create python files and import necessary libraries**

View File

@@ -0,0 +1,25 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / FtsOptions
# Interface: FtsOptions
Options to create an `FTS` index
## Properties
### withPosition?
> `optional` **withPosition**: `boolean`
Whether to store the positions of the term in the document.
If this is true then the index will store the positions of the term in the document.
This allows phrase queries to be run. But it also increases the size of the index,
and the time to build the index.
The default value is true.
***

View File

@@ -0,0 +1,33 @@
## Getting help
The following sections provide various diagnostics and troubleshooting tips for LanceDB.
These can help you provide additional information when asking questions or making
error reports.
For troubleshooting, the best place to ask is in our Discord, under the relevant
language channel. By asking in the language-specific channel, it makes it more
likely that someone who knows the answer will see your question.
## Enabling logging
To provide more information, especially for LanceDB Cloud related issues, enable
debug logging. You can set the `LANCEDB_LOG` environment variable:
```shell
export LANCEDB_LOG=debug
```
You can turn off colors and formatting in the logs by setting
```shell
export LANCEDB_LOG_STYLE=never
```
## Explaining query plans
If you have slow queries or unexpected query results, it can be helpful to
print the resolved query plan. You can use the `explain_plan` method to do this:
* Python Sync: [LanceQueryBuilder.explain_plan][lancedb.query.LanceQueryBuilder.explain_plan]
* Python Async: [AsyncQueryBase.explain_plan][lancedb.query.AsyncQueryBase.explain_plan]
* Node @lancedb/lancedb: [LanceQueryBuilder.explainPlan](/lancedb/js/classes/QueryBase/#explainplan)

View File

@@ -3,7 +3,7 @@ numpy
pandas
pylance
duckdb
tantivy==0.20.1
--extra-index-url https://download.pytorch.org/whl/cpu
torch
polars>=0.19, <=1.3.0

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.11.1-beta.0</version>
<version>0.11.1-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.11.1-beta.0</version>
<version>0.11.1-beta.1</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

79
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.11.1-beta.0",
"@lancedb/vectordb-darwin-x64": "0.11.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.11.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.11.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.11.1-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.11.1-beta.1",
"@lancedb/vectordb-darwin-x64": "0.11.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.11.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.11.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.11.1-beta.1"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -326,6 +326,71 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.11.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.11.1-beta.1.tgz",
"integrity": "sha512-q9jcCbmcz45UHmjgecL6zK82WaqUJsARfniwXXPcnd8ooISVhPkgN+RVKv6edwI9T0PV+xVRYq+LQLlZu5fyxw==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.11.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.11.1-beta.1.tgz",
"integrity": "sha512-E5tCTS5TaTkssTPa+gdnFxZJ1f60jnSIJXhqufNFZk4s+IMViwR1BPqaqE++WY5c1uBI55ef1862CROKDKX4gg==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.11.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.11.1-beta.1.tgz",
"integrity": "sha512-Obohy6TH31Uq+fp6ZisHR7iAsvgVPqBExrycVcIJqrLZnIe88N9OWUwBXkmfMAw/2hNJFwD4tU7+4U2FcBWX4w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.11.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.11.1-beta.1.tgz",
"integrity": "sha512-3Meu0dgrzNrnBVVQhxkUSAOhQNmgtKHvOvmrRLUicV+X19hd33udihgxVpZZb9mpXenJ8lZsS+Jq6R0hWqntag==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.11.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.11.1-beta.1.tgz",
"integrity": "sha512-BafZ9OJPQXsS7JW0weAl12wC+827AiRjfUrE5tvrYWZah2OwCF2U2g6uJ3x4pxfwEGsv5xcHFqgxlS7ttFkh+Q==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -88,10 +88,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.11.1-beta.0",
"@lancedb/vectordb-darwin-x64": "0.11.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.11.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.11.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.11.1-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.11.1-beta.1",
"@lancedb/vectordb-darwin-x64": "0.11.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.11.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.11.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.11.1-beta.1"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.11.1-beta.0"
version = "0.11.1-beta.1"
license.workspace = true
description.workspace = true
repository.workspace = true
@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
[dependencies]
arrow-ipc.workspace = true
env_logger.workspace = true
futures.workspace = true
lancedb = { path = "../rust/lancedb", features = ["remote"] }
napi = { version = "2.16.8", default-features = false, features = [
@@ -22,6 +23,7 @@ napi = { version = "2.16.8", default-features = false, features = [
napi-derive = "2.16.4"
# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
log.workspace = true
[build-dependencies]
napi-build = "2.1"

View File

@@ -90,4 +90,29 @@ describe("remote connection", () => {
},
);
});
// Checks that an error surfaced after the client exhausts its retries carries
// the whole error chain (the top-level retry message plus the underlying HTTP
// cause), not just the top-level message.
it("shows the full error messages on retry errors", async () => {
await withMockDatabase(
(_req, res) => {
// We retry on 500 errors, so we return 500s until the client gives up.
res.writeHead(500).end("Internal Server Error");
},
async (db) => {
try {
await db.tableNames();
// Reaching this line means no error was thrown, which is itself a failure.
fail("expected an error");
// biome-ignore lint/suspicious/noExplicitAny: skip
} catch (e: any) {
// The message must include the top-level retry error, the chained
// cause, and the HTTP status text returned by the mock server.
expect(e.message).toContain("Hit retry limit for request_id=");
expect(e.message).toContain("Caused by: Http error");
expect(e.message).toContain("500 Internal Server Error");
}
},
{
clientConfig: {
// NOTE(review): presumably a low retry count keeps the test short — confirm.
retryConfig: { retries: 2 },
},
},
);
});
});

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.11.0",
"version": "0.11.1-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.11.0",
"version": "0.11.1-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.11.1-beta.0",
"version": "0.11.1-beta.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -18,6 +18,7 @@ use std::str::FromStr;
use napi::bindgen_prelude::*;
use napi_derive::*;
use crate::error::{convert_error, NapiErrorExt};
use crate::table::Table;
use crate::ConnectionOptions;
use lancedb::connection::{
@@ -86,12 +87,7 @@ impl Connection {
builder = builder.host_override(&host_override);
}
Ok(Self::inner_new(
builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
))
Ok(Self::inner_new(builder.execute().await.default_error()?))
}
#[napi]
@@ -123,9 +119,7 @@ impl Connection {
if let Some(limit) = limit {
op = op.limit(limit);
}
op.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))
op.execute().await.default_error()
}
/// Create table from a Apache Arrow IPC (file) buffer.
@@ -156,17 +150,13 @@ impl Connection {
}
if let Some(data_storage_option) = data_storage_options.as_ref() {
builder = builder.data_storage_version(
LanceFileVersion::from_str(data_storage_option)
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
LanceFileVersion::from_str(data_storage_option).map_err(|e| convert_error(&e))?,
);
}
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
let tbl = builder.execute().await.default_error()?;
Ok(Table::new(tbl))
}
@@ -195,17 +185,13 @@ impl Connection {
}
if let Some(data_storage_option) = data_storage_options.as_ref() {
builder = builder.data_storage_version(
LanceFileVersion::from_str(data_storage_option)
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
LanceFileVersion::from_str(data_storage_option).map_err(|e| convert_error(&e))?,
);
}
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
let tbl = builder.execute().await.default_error()?;
Ok(Table::new(tbl))
}
@@ -225,19 +211,13 @@ impl Connection {
if let Some(index_cache_size) = index_cache_size {
builder = builder.index_cache_size(index_cache_size);
}
let tbl = builder
.execute()
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?;
let tbl = builder.execute().await.default_error()?;
Ok(Table::new(tbl))
}
/// Drop table with the name. Or raise an error if the table does not exist.
#[napi(catch_unwind)]
pub async fn drop_table(&self, name: String) -> napi::Result<()> {
self.get_inner()?
.drop_table(&name)
.await
.map_err(|e| napi::Error::from_reason(format!("{}", e)))
self.get_inner()?.drop_table(&name).await.default_error()
}
}

View File

@@ -7,6 +7,31 @@ pub trait NapiErrorExt<T> {
impl<T> NapiErrorExt<T> for std::result::Result<T, lancedb::Error> {
fn default_error(self) -> Result<T> {
self.map_err(|err| napi::Error::from_reason(err.to_string()))
self.map_err(|err| convert_error(&err))
}
}
/// Converts an arbitrary error into a [`napi::Error`] whose reason contains
/// the full error chain, not just the top-level message.
///
/// The reason starts with `err`'s own `Display` output. Every error reachable
/// through [`std::error::Error::source`] is then appended on its own line as
/// `Caused by: <message>`, indented two additional spaces per nesting level:
///
/// ```text
/// top-level message
///   Caused by: first cause
///     Caused by: second cause
/// ```
pub fn convert_error(err: &dyn std::error::Error) -> napi::Error {
    let mut message = err.to_string();

    // Append causes
    let mut cause = err.source();
    let mut indent = 2;
    while let Some(err) = cause {
        let cause_message = format!("Caused by: {}", err);
        // Start every cause on a fresh line; without this separator the
        // indented cause is glued directly onto the end of the previous
        // message and the chain becomes a single unreadable line.
        message.push('\n');
        message.push_str(&indent_string(&cause_message, indent));

        cause = err.source();
        indent += 2;
    }

    napi::Error::from_reason(message)
}
/// Prefixes every line of `s` with `amount` spaces.
///
/// Lines are split with [`str::lines`], so a trailing line terminator is
/// dropped and `\r\n` is treated like `\n`. The indented lines are joined
/// back together with a single `\n`. An empty input yields an empty string.
fn indent_string(s: &str, amount: usize) -> String {
    let padding = " ".repeat(amount);
    let mut indented = String::new();
    for (position, text) in s.lines().enumerate() {
        // Separator goes between lines only, never before the first one.
        if position > 0 {
            indented.push('\n');
        }
        indented.push_str(&padding);
        indented.push_str(text);
    }
    indented
}

View File

@@ -14,6 +14,7 @@
use std::collections::HashMap;
use env_logger::Env;
use napi_derive::*;
mod connection;
@@ -77,3 +78,11 @@ pub struct WriteOptions {
pub struct OpenTableOptions {
pub storage_options: Option<HashMap<String, String>>,
}
/// Module initializer: sets up `env_logger` when the native module is loaded.
///
/// * `LANCEDB_LOG` selects the log filter (for example `debug` or `trace`).
///   When it is unset the filter falls back to `warn`, so users only see
///   verbose output after explicitly opting in.
/// * `LANCEDB_LOG_STYLE` controls the write style (for example `never` to
///   turn off colors and formatting).
#[napi::module_init]
fn init() {
    let env = Env::new()
        // Quiet fallback: a `trace` default would flood every application
        // that merely loads the module with maximum-verbosity logs.
        .filter_or("LANCEDB_LOG", "warn")
        .write_style("LANCEDB_LOG_STYLE");
    env_logger::init_from_env(env);
}

View File

@@ -2,6 +2,8 @@ use lancedb::{arrow::IntoArrow, ipc::ipc_file_to_batches, table::merge::MergeIns
use napi::bindgen_prelude::*;
use napi_derive::napi;
use crate::error::convert_error;
#[napi]
#[derive(Clone)]
/// A builder used to create and run a merge insert operation
@@ -35,14 +37,18 @@ impl NativeMergeInsertBuilder {
pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
let data = ipc_file_to_batches(buf.to_vec())
.and_then(IntoArrow::into_arrow)
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
.map_err(|e| {
napi::Error::from_reason(format!("Failed to read IPC file: {}", convert_error(&e)))
})?;
let this = self.clone();
this.inner
.execute(data)
.await
.map_err(|e| napi::Error::from_reason(format!("Failed to execute merge insert: {}", e)))
this.inner.execute(data).await.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to execute merge insert: {}",
convert_error(&e)
))
})
}
}

View File

@@ -22,6 +22,7 @@ use lancedb::query::VectorQuery as LanceDbVectorQuery;
use napi::bindgen_prelude::*;
use napi_derive::napi;
use crate::error::convert_error;
use crate::error::NapiErrorExt;
use crate::iterator::RecordBatchIterator;
use crate::util::parse_distance_type;
@@ -93,7 +94,10 @@ impl Query {
.execute_with_options(execution_opts)
.await
.map_err(|e| {
napi::Error::from_reason(format!("Failed to execute query stream: {}", e))
napi::Error::from_reason(format!(
"Failed to execute query stream: {}",
convert_error(&e)
))
})?;
Ok(RecordBatchIterator::new(inner_stream))
}
@@ -101,7 +105,10 @@ impl Query {
#[napi]
pub async fn explain_plan(&self, verbose: bool) -> napi::Result<String> {
self.inner.explain_plan(verbose).await.map_err(|e| {
napi::Error::from_reason(format!("Failed to retrieve the query plan: {}", e))
napi::Error::from_reason(format!(
"Failed to retrieve the query plan: {}",
convert_error(&e)
))
})
}
}
@@ -190,7 +197,10 @@ impl VectorQuery {
.execute_with_options(execution_opts)
.await
.map_err(|e| {
napi::Error::from_reason(format!("Failed to execute query stream: {}", e))
napi::Error::from_reason(format!(
"Failed to execute query stream: {}",
convert_error(&e)
))
})?;
Ok(RecordBatchIterator::new(inner_stream))
}
@@ -198,7 +208,10 @@ impl VectorQuery {
#[napi]
pub async fn explain_plan(&self, verbose: bool) -> napi::Result<String> {
self.inner.explain_plan(verbose).await.map_err(|e| {
napi::Error::from_reason(format!("Failed to retrieve the query plan: {}", e))
napi::Error::from_reason(format!(
"Failed to retrieve the query plan: {}",
convert_error(&e)
))
})
}
}

View File

@@ -72,10 +72,7 @@ impl Table {
/// Return Schema as empty Arrow IPC file.
#[napi(catch_unwind)]
pub async fn schema(&self) -> napi::Result<Buffer> {
let schema =
self.inner_ref()?.schema().await.map_err(|e| {
napi::Error::from_reason(format!("Failed to create IPC file: {}", e))
})?;
let schema = self.inner_ref()?.schema().await.default_error()?;
let mut writer = FileWriter::try_new(vec![], &schema)
.map_err(|e| napi::Error::from_reason(format!("Failed to create IPC file: {}", e)))?;
writer
@@ -100,12 +97,7 @@ impl Table {
return Err(napi::Error::from_reason(format!("Invalid mode: {}", mode)));
};
op.execute().await.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to add batches to table {}: {}",
self.name, e
))
})
op.execute().await.default_error()
}
#[napi(catch_unwind)]
@@ -114,22 +106,12 @@ impl Table {
.count_rows(filter)
.await
.map(|val| val as i64)
.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to count rows in table {}: {}",
self.name, e
))
})
.default_error()
}
#[napi(catch_unwind)]
pub async fn delete(&self, predicate: String) -> napi::Result<()> {
self.inner_ref()?.delete(&predicate).await.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to delete rows in table {}: predicate={}",
self.name, e
))
})
self.inner_ref()?.delete(&predicate).await.default_error()
}
#[napi(catch_unwind)]
@@ -187,12 +169,7 @@ impl Table {
self.inner_ref()?
.add_columns(transforms, None)
.await
.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to add columns to table {}: {}",
self.name, err
))
})?;
.default_error()?;
Ok(())
}
@@ -213,12 +190,7 @@ impl Table {
self.inner_ref()?
.alter_columns(&alterations)
.await
.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to alter columns in table {}: {}",
self.name, err
))
})?;
.default_error()?;
Ok(())
}
@@ -228,12 +200,7 @@ impl Table {
self.inner_ref()?
.drop_columns(&col_refs)
.await
.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to drop columns from table {}: {}",
self.name, err
))
})?;
.default_error()?;
Ok(())
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.14.1-beta.1"
current_version = "0.15.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.14.1-beta.1"
version = "0.15.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true
@@ -16,7 +16,7 @@ crate-type = ["cdylib"]
[dependencies]
arrow = { version = "52.1", features = ["pyarrow"] }
lancedb = { path = "../rust/lancedb" }
env_logger = "0.10"
env_logger.workspace = true
pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.18.3-beta.2",
"pylance==0.19.1",
"requests>=2.31.0",
"tqdm>=4.27.0",
"pydantic>=1.10",

View File

@@ -26,7 +26,7 @@ registry = EmbeddingFunctionRegistry.get_instance()
@registry.register("test")
class MockTextEmbeddingFunction(TextEmbeddingFunction):
"""
Return the hash of the first 10 characters
Return the hash of the first 10 characters (normalized)
"""
def generate_embeddings(self, texts):
@@ -41,6 +41,23 @@ class MockTextEmbeddingFunction(TextEmbeddingFunction):
return 10
@registry.register("nonnorm")
class MockNonNormTextEmbeddingFunction(TextEmbeddingFunction):
    """
    Mock embedding built from the code points of the first 10 characters.

    The resulting vectors are deliberately left un-normalized.
    """

    def generate_embeddings(self, texts):
        # One embedding per input row, in input order.
        embeddings = []
        for text in texts:
            embeddings.append(self._compute_one_embedding(text))
        return embeddings

    def _compute_one_embedding(self, row):
        # Each of the first 10 characters contributes its ordinal as a float.
        codes = [float(ord(ch)) for ch in row[:10]]
        vector = np.array(codes)
        # Rows shorter than 10 characters fall back to an all-zero vector.
        if len(vector) != 10:
            return [0] * 10
        return vector

    def ndims(self):
        # Dimensionality of every embedding produced by this mock.
        return 10
class RateLimitedAPI:
rate_limit = 0.1 # 1 request per 0.1 second
last_request_time = 0

View File

@@ -7,6 +7,27 @@ from ._lancedb import (
IndexConfig,
)
# Maps the two-letter prefix of a "<code>_stem" tokenizer name to the language
# name expected by the tokenizer configuration.
lang_mapping = {
    "ar": "Arabic",
    "da": "Danish",
    # "du" is kept for backward compatibility; "nl" is the ISO 639-1 code.
    "du": "Dutch",
    "nl": "Dutch",
    "en": "English",
    "fi": "Finnish",
    "fr": "French",
    "de": "German",
    # "gr" is kept for backward compatibility; "el" is the ISO 639-1 code.
    "gr": "Greek",
    "el": "Greek",
    "hu": "Hungarian",
    "it": "Italian",
    "no": "Norwegian",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "es": "Spanish",
    "sv": "Swedish",
    "ta": "Tamil",
    "tr": "Turkish",
}
class BTree:
"""Describes a btree index configuration
@@ -78,7 +99,17 @@ class FTS:
For example, it works with `title`, `description`, `content`, etc.
"""
def __init__(self, with_position: bool = True):
def __init__(
    self,
    with_position: bool = True,
    base_tokenizer: str = "simple",
    language: str = "English",
    max_token_length: Optional[int] = 40,
    lower_case: bool = True,
    stem: bool = False,
    remove_stop_words: bool = False,
    ascii_folding: bool = False,
):
    """Create a full-text search index configuration.

    Every argument is forwarded unchanged to the native ``LanceDbIndex.fts``
    constructor.

    Parameters
    ----------
    with_position: bool, default True
        Forwarded as ``with_position``.
    base_tokenizer: str, default "simple"
        Name of the base tokenizer.
    language: str, default "English"
        Language name used by the tokenizer configuration.
    max_token_length: int or None, default 40
        Forwarded as ``max_token_length``; ``None`` is passed through as-is.
    lower_case: bool, default True
        Forwarded as ``lower_case``.
    stem: bool, default False
        Forwarded as ``stem``.
    remove_stop_words: bool, default False
        Forwarded as ``remove_stop_words``.
    ascii_folding: bool, default False
        Forwarded as ``ascii_folding``.
    """
    # Pass every tokenizer option through; otherwise the native side silently
    # falls back to its own defaults and the caller's settings are ignored.
    self._inner = LanceDbIndex.fts(
        with_position=with_position,
        base_tokenizer=base_tokenizer,
        language=language,
        max_token_length=max_token_length,
        lower_case=lower_case,
        stem=stem,
        remove_stop_words=remove_stop_words,
        ascii_folding=ascii_folding,
    )

View File

@@ -983,6 +983,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._reranker = RRFReranker()
self._nprobes = None
self._refine_factor = None
self._metric = None
self._phrase_query = False
def _validate_query(self, query, vector=None, text=None):
@@ -1050,6 +1051,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._fts_query.with_row_id(True)
if self._phrase_query:
self._fts_query.phrase_query(True)
if self._metric:
self._vector_query.metric(self._metric)
if self._nprobes:
self._vector_query.nprobes(self._nprobes)
if self._refine_factor:
@@ -1067,6 +1070,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
if self._norm == "rank":
vector_results = self._rank(vector_results, "_distance")
fts_results = self._rank(fts_results, "_score")
# normalize the scores to be between 0 and 1, 0 being most relevant
vector_results = self._normalize_scores(vector_results, "_distance")
@@ -1115,7 +1119,9 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
rng = max
else:
rng = max - min
scores = (scores - min) / rng
# If rng is 0 then min and max are both 0 and so we can leave the scores as is
if rng != 0:
scores = (scores - min) / rng
if invert:
scores = 1 - scores
# replace the _score column with the ranks
@@ -1177,6 +1183,22 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._nprobes = nprobes
return self
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
    """Choose the distance metric for the vector part of the hybrid search.

    Parameters
    ----------
    metric: "L2" or "cosine" or "dot"
        Name of the distance metric. It is stored in lower case.

    Returns
    -------
    LanceHybridQueryBuilder
        This builder, so that calls can be chained.
    """
    normalized = metric.lower()
    self._metric = normalized
    return self
def refine_factor(self, refine_factor: int) -> LanceHybridQueryBuilder:
"""
Refine the vector search results by reading extra elements and

View File

@@ -55,6 +55,7 @@ from .util import (
safe_import_polars,
value_to_sql,
)
from .index import lang_mapping
if TYPE_CHECKING:
import PIL
@@ -497,10 +498,18 @@ class Table(ABC):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
"""Create a full-text search index on the table.
@@ -526,7 +535,6 @@ class Table(ABC):
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
language code followed by "_stem". So for english it would be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
only available with use_tantivy=True for now
use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
@@ -1341,14 +1349,33 @@ class LanceTable(Table):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
raise ValueError("field_names must be a string when use_tantivy=False")
tokenizer_configs = {
"base_tokenizer": base_tokenizer,
"language": language,
"max_token_length": max_token_length,
"lower_case": lower_case,
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
}
if tokenizer_name is not None:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
# delete the existing legacy index if it exists
if replace:
path, fs, exist = self._get_fts_index_path()
@@ -1359,6 +1386,7 @@ class LanceTable(Table):
index_type="INVERTED",
replace=replace,
with_position=with_position,
**tokenizer_configs,
)
return
@@ -1381,6 +1409,8 @@ class LanceTable(Table):
"Full-text search is only supported on the local filesystem"
)
if tokenizer_name is None:
tokenizer_name = "default"
index = create_index(
path,
field_names,
@@ -1395,6 +1425,56 @@ class LanceTable(Table):
writer_heap_size=writer_heap_size,
)
@staticmethod
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
    """Translate a tokenizer name into explicit tokenizer settings.

    Declared as a static method because it takes no instance argument but is
    invoked through an instance (``self.infer_tokenizer_configs(name)``).

    Parameters
    ----------
    tokenizer_name: str
        One of "default", "raw", "whitespace", or a two-letter language code
        followed by "_stem" (for example "en_stem").

    Returns
    -------
    dict
        Settings keyed by ``base_tokenizer``, ``language``,
        ``max_token_length``, ``lower_case``, ``stem``,
        ``remove_stop_words`` and ``ascii_folding``.

    Raises
    ------
    ValueError
        If the name is not one of the fixed names, does not have the
        "<code>_stem" shape, or uses an unknown language code.
    """
    if tokenizer_name == "default":
        return {
            "base_tokenizer": "simple",
            "language": "English",
            "max_token_length": 40,
            "lower_case": True,
            "stem": False,
            "remove_stop_words": False,
            "ascii_folding": False,
        }
    elif tokenizer_name == "raw":
        return {
            "base_tokenizer": "raw",
            "language": "English",
            "max_token_length": None,
            "lower_case": False,
            "stem": False,
            "remove_stop_words": False,
            "ascii_folding": False,
        }
    elif tokenizer_name == "whitespace":
        return {
            "base_tokenizer": "whitespace",
            "language": "English",
            "max_token_length": None,
            "lower_case": False,
            "stem": False,
            "remove_stop_words": False,
            "ascii_folding": False,
        }

    # Otherwise the name must be language stemming, shaped like "en_stem":
    # two characters of language code plus the five-character "_stem" suffix.
    if len(tokenizer_name) != 7:
        raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
    lang = tokenizer_name[:2]
    if tokenizer_name[-5:] != "_stem":
        raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
    if lang not in lang_mapping:
        raise ValueError(f"Invalid language code {lang}")
    return {
        "base_tokenizer": "simple",
        "language": lang_mapping[lang],
        "max_token_length": 40,
        "lower_case": True,
        "stem": True,
        "remove_stop_words": False,
        "ascii_folding": False,
    }
def add(
self,
data: DATA,

View File

@@ -991,13 +991,10 @@ def test_count_rows(db):
assert table.count_rows(filter="text='bar'") == 1
def test_hybrid_search(db, tmp_path):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
def setup_hybrid_search_table(tmp_path, embedding_func):
db = MockDB(str(tmp_path))
# Create a LanceDB table schema with a vector and a text column
emb = EmbeddingFunctionRegistry.get_instance().get("test")()
emb = EmbeddingFunctionRegistry.get_instance().get(embedding_func)()
class MyTable(LanceModel):
text: str = emb.SourceField()
@@ -1030,6 +1027,15 @@ def test_hybrid_search(db, tmp_path):
# Create a fts index
table.create_fts_index("text")
return table, MyTable, emb
def test_hybrid_search(tmp_path):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
table, MyTable, emb = setup_hybrid_search_table(tmp_path, "test")
result1 = (
table.search("Our father who art in heaven", query_type="hybrid")
.rerank(normalize="score")
@@ -1094,6 +1100,24 @@ def test_hybrid_search(db, tmp_path):
table.search(query_type="hybrid").text("Arrrrggghhhhhhh").to_list()
def test_hybrid_search_metric_type(db, tmp_path):
    """A hybrid search with an explicit metric must score differently."""
    # The hybrid table relies on an FTS index, so skip when it is unavailable.
    pytest.importorskip("lancedb.fts")

    # The "nonnorm" embedding function is required here: with it, the L2 and
    # dot results are different.
    table, _, _ = setup_hybrid_search_table(tmp_path, "nonnorm")
    query_text = "feeling lucky"

    # Run once with the metric overridden to "dot" ...
    dot_builder = table.search(query_text, query_type="hybrid")
    dot_table = dot_builder.metric("dot").to_arrow()
    # ... and once without choosing a metric.
    default_table = table.search(query_text, query_type="hybrid").to_arrow()

    assert len(dot_table) > 0
    assert len(default_table) > 0
    assert dot_table["_relevance_score"] != default_table["_relevance_score"]
@pytest.mark.parametrize(
"consistency_interval", [None, timedelta(seconds=0), timedelta(seconds=0.1)]
)

View File

@@ -106,12 +106,41 @@ impl Index {
})
}
#[allow(clippy::too_many_arguments)]
#[staticmethod]
pub fn fts(with_position: Option<bool>) -> Self {
pub fn fts(
with_position: Option<bool>,
base_tokenizer: Option<String>,
language: Option<String>,
max_token_length: Option<usize>,
lower_case: Option<bool>,
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
opts = opts.with_position(with_position);
}
if let Some(base_tokenizer) = base_tokenizer {
opts.tokenizer_configs = opts.tokenizer_configs.base_tokenizer(base_tokenizer);
}
if let Some(language) = language {
opts.tokenizer_configs = opts.tokenizer_configs.language(&language).unwrap();
}
opts.tokenizer_configs = opts.tokenizer_configs.max_token_length(max_token_length);
if let Some(lower_case) = lower_case {
opts.tokenizer_configs = opts.tokenizer_configs.lower_case(lower_case);
}
if let Some(stem) = stem {
opts.tokenizer_configs = opts.tokenizer_configs.stem(stem);
}
if let Some(remove_stop_words) = remove_stop_words {
opts.tokenizer_configs = opts.tokenizer_configs.remove_stop_words(remove_stop_words);
}
if let Some(ascii_folding) = ascii_folding {
opts.tokenizer_configs = opts.tokenizer_configs.ascii_folding(ascii_folding);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
}

View File

@@ -1,2 +1,2 @@
[toolchain]
channel = "1.79.0"
channel = "1.80.0"

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.11.1-beta.0"
version = "0.11.1-beta.1"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.11.1-beta.0"
version = "0.11.1-beta.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -54,12 +54,14 @@ pub struct LabelListIndexBuilder {}
#[derive(Debug, Clone)]
pub struct FtsIndexBuilder {
    /// Value handed to the inverted index parameters as `with_position` when
    /// the index is created. The `Default` impl sets it to `true`.
    pub(crate) with_position: bool,
    /// Tokenizer settings handed to the inverted index parameters as
    /// `tokenizer_config` when the index is created.
    pub tokenizer_configs: TokenizerConfig,
}
impl Default for FtsIndexBuilder {
    /// Builds the default options: `with_position` enabled and the default
    /// tokenizer configuration.
    fn default() -> Self {
        let tokenizer_configs = TokenizerConfig::default();
        Self {
            with_position: true,
            tokenizer_configs,
        }
    }
}
@@ -72,4 +74,5 @@ impl FtsIndexBuilder {
}
}
use lance_index::scalar::inverted::TokenizerConfig;
pub use lance_index::scalar::FullTextSearchQuery;

View File

@@ -266,6 +266,7 @@ impl RestfulLanceDbClient<Sender> {
Some(host_override) => host_override,
None => format!("https://{}.{}.api.lancedb.com", db_name, region),
};
debug!("Created client for host: {}", host);
let retry_config = client_config.retry_config.try_into()?;
Ok(Self {
client,
@@ -340,6 +341,8 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
request_id
};
debug!("Sending request_id={}: {:?}", request_id, &request);
if with_retry {
self.send_with_retry_impl(client, request, request_id).await
} else {
@@ -348,6 +351,10 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
.send(&client, request)
.await
.err_to_http(request_id.clone())?;
debug!(
"Received response for request_id={}: {:?}",
request_id, &response
);
Ok((request_id, response))
}
}
@@ -374,7 +381,11 @@ impl<S: HttpSend> RestfulLanceDbClient<S> {
.map(|r| (r.status(), r));
match response {
Ok((status, response)) if status.is_success() => {
return Ok((retry_counter.request_id, response))
debug!(
"Received response for request_id={}: {:?}",
retry_counter.request_id, &response
);
return Ok((retry_counter.request_id, response));
}
Ok((status, response)) if self.retry_config.statuses.contains(&status) => {
let source = self

View File

@@ -1568,6 +1568,7 @@ impl NativeTable {
let mut dataset = self.dataset.get_mut().await?;
let fts_params = lance_index::scalar::InvertedIndexParams {
with_position: fts_opts.with_position,
tokenizer_config: fts_opts.tokenizer_configs,
};
dataset
.create_index(
@@ -2002,7 +2003,7 @@ impl TableInternal for NativeTable {
self.dataset
.get_mut()
.await?
.add_columns(transforms, read_columns)
.add_columns(transforms, read_columns, None)
.await?;
Ok(())
}