Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-25 22:29:58 +00:00)
feat!: upgrade lance to 0.19.1 (#1762)
BREAKING CHANGE: the default tokenizer no longer does stemming or stop-word removal. Users should explicitly turn those options on in the future.

- upgrade lance to 0.19.1
- update the FTS docs
- update the FTS API

Upstream change notes: https://github.com/lancedb/lance/releases/tag/v0.19.1

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
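For users affected by the breaking change, a minimal sketch of how stemming and stop-word removal might be re-enabled explicitly after upgrading, using the tokenizer keyword arguments added to `create_fts_index` in this change (the database path, table name, and column are placeholders):

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.open_table("my_table")  # hypothetical existing table with a "text" column

# The default tokenizer now only splits, lowercases, and drops tokens longer
# than 40 chars; stemming and stop-word removal must be requested explicitly.
table.create_fts_index(
    "text",
    use_tantivy=False,       # native Lance FTS path
    language="English",
    stem=True,               # re-enable stemming
    remove_stop_words=True,  # re-enable stop-word removal
    replace=True,
)
```

Passing `tokenizer_name="en_stem"` remains supported and maps to the same stemming configuration through the `infer_tokenizer_configs` helper added further down in this diff.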
Cargo.toml (17 changed lines)

@@ -18,17 +18,16 @@ repository = "https://github.com/lancedb/lancedb"
 description = "Serverless, low-latency vector database for AI applications"
 keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
+rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
 
 [workspace.dependencies]
-lance = { "version" = "=0.18.3", "features" = [
-    "dynamodb",
-], git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-index = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-linalg = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-table = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-testing = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-datafusion = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
-lance-encoding = { "version" = "=0.18.3", git = "https://github.com/lancedb/lance.git", tag = "v0.18.3-beta.2" }
+lance = { "version" = "=0.19.1", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.19.1" }
+lance-linalg = { "version" = "=0.19.1" }
+lance-table = { "version" = "=0.19.1" }
+lance-testing = { "version" = "=0.19.1" }
+lance-datafusion = { "version" = "=0.19.1" }
+lance-encoding = { "version" = "=0.19.1" }
 # Note that this one does not include pyarrow
 arrow = { version = "52.2", optional = false }
 arrow-array = "52.2"
@@ -100,7 +100,7 @@ nav:
 - 🏃🏼‍♂️ Quick start: basic.md
 - 📚 Concepts:
 - Vector search: concepts/vector_search.md
 - Indexing:
 - IVFPQ: concepts/index_ivfpq.md
 - HNSW: concepts/index_hnsw.md
 - Storage: concepts/storage.md
@@ -109,7 +109,8 @@ nav:
 - Working with tables: guides/tables.md
 - Building a vector index: ann_indexes.md
 - Vector Search: search.md
-- Full-text search: fts.md
+- Full-text search (native): fts.md
+- Full-text search (tantivy-based): fts_tantivy.md
 - Building a scalar index: guides/scalar_index.md
 - Hybrid search:
 - Overview: hybrid_search/hybrid_search.md
@@ -148,10 +149,10 @@ nav:
 - Reranking: guides/tuning_retrievers/2_reranking.md
 - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
 - 🧬 Managing embeddings:
 - Understand Embeddings: embeddings/understanding_embeddings.md
 - Get Started: embeddings/index.md
 - Embedding functions: embeddings/embedding_functions.md
 - Available models:
 - Overview: embeddings/default_embedding_functions.md
 - Text Embedding Functions:
 - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
@@ -200,7 +201,7 @@ nav:
 - Evaluation: examples/python_examples/evaluations.md
 - AI Agent: examples/python_examples/aiagent.md
 - Recommender System: examples/python_examples/recommendersystem.md
 - Miscellaneous:
 - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
 - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
 - 👾 JavaScript:
@@ -228,7 +229,7 @@ nav:
 - Quick start: basic.md
 - Concepts:
 - Vector search: concepts/vector_search.md
 - Indexing:
 - IVFPQ: concepts/index_ivfpq.md
 - HNSW: concepts/index_hnsw.md
 - Storage: concepts/storage.md
@@ -237,7 +238,8 @@ nav:
 - Working with tables: guides/tables.md
 - Building an ANN index: ann_indexes.md
 - Vector Search: search.md
-- Full-text search: fts.md
+- Full-text search (native): fts.md
+- Full-text search (tantivy-based): fts_tantivy.md
 - Building a scalar index: guides/scalar_index.md
 - Hybrid search:
 - Overview: hybrid_search/hybrid_search.md
@@ -276,10 +278,10 @@ nav:
 - Reranking: guides/tuning_retrievers/2_reranking.md
 - Embedding fine-tuning: guides/tuning_retrievers/3_embed_tuning.md
 - Managing Embeddings:
 - Understand Embeddings: embeddings/understanding_embeddings.md
 - Get Started: embeddings/index.md
 - Embedding functions: embeddings/embedding_functions.md
 - Available models:
 - Overview: embeddings/default_embedding_functions.md
 - Text Embedding Functions:
 - Sentence Transformers: embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md
@@ -324,7 +326,7 @@ nav:
 - Evaluation: examples/python_examples/evaluations.md
 - AI Agent: examples/python_examples/aiagent.md
 - Recommender System: examples/python_examples/recommendersystem.md
 - Miscellaneous:
 - Serverless QA Bot with S3 and Lambda: examples/serverless_lancedb_with_s3_and_lambda.md
 - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
 - 👾 JavaScript:
@@ -367,5 +369,4 @@ extra:
 - icon: fontawesome/brands/x-twitter
   link: https://twitter.com/lancedb
 - icon: fontawesome/brands/linkedin
   link: https://www.linkedin.com/company/lancedb
-
docs/src/fts.md (158 changed lines)

@@ -1,21 +1,9 @@
-# Full-text search
+# Full-text search (Native FTS)
 
-LanceDB provides support for full-text search via Lance (before via [Tantivy](https://github.com/quickwit-oss/tantivy) (Python only)), allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
+LanceDB provides support for full-text search via Lance, allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
 
-Currently, the Lance full text search is missing some features that are in the Tantivy full text search. This includes query parser and customizing the tokenizer. Thus, in Python, Tantivy is still the default way to do full text search and many of the instructions below apply just to Tantivy-based indices.
-
-## Installation (Only for Tantivy-based FTS)
-
 !!! note
-    No need to install the tantivy dependency if using native FTS
+    The Python SDK uses tantivy-based FTS by default; pass `use_tantivy=False` to use native FTS.
 
-To use full-text search, install the dependency [`tantivy-py`](https://github.com/quickwit-oss/tantivy-py):
-
-```sh
-# Say you want to use tantivy==0.20.1
-pip install tantivy==0.20.1
-```
-
 ## Example
 
@@ -39,7 +27,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
 
 # passing `use_tantivy=False` to use lance FTS index
 # `use_tantivy=True` by default
-table.create_fts_index("text")
+table.create_fts_index("text", use_tantivy=False)
 table.search("puppy").limit(10).select(["text"]).to_list()
 # [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
 # ...
@@ -93,51 +81,40 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
 ```
 
 It would search on all indexed columns by default, so it's useful when there are multiple indexed columns.
-For now, this is supported in tantivy way only.
 
-Passing `fts_columns="text"` if you want to specify the columns to search, but it's not available for Tantivy-based full text search.
+Pass `fts_columns="text"` if you want to specify the columns to search.
 
 !!! note
    LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
 
 ## Tokenization
-By default the text is tokenized by splitting on punctuation and whitespaces and then removing tokens that are longer than 40 chars. For more language specific tokenization then provide the argument tokenizer_name with the 2 letter language code followed by "_stem". So for english it would be "en_stem".
+By default the text is tokenized by splitting on punctuation and whitespace, filtering out words longer than 40 characters, and lowercasing all words.
 
-For now, only the Tantivy-based FTS index supports to specify the tokenizer, so it's only available in Python with `use_tantivy=True`.
+Stemming is useful for improving search results by reducing words to their root form, e.g. "running" to "run". LanceDB supports stemming for multiple languages; you can enable stemming by specifying a tokenizer name of the form `tokenizer_name="{language_code}_stem"`, e.g. `en_stem` for English.
 
-=== "use_tantivy=True"
-
-    ```python
-    table.create_fts_index("text", use_tantivy=True, tokenizer_name="en_stem")
-    ```
+For example, to enable stemming for English:
+```python
+table.create_fts_index("text", use_tantivy=True, tokenizer_name="en_stem")
+```
 
-=== "use_tantivy=False"
-
-    [**Not supported yet**](https://github.com/lancedb/lance/issues/1195)
-
 the following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
 
-## Index multiple columns
+The tokenizer is customizable: you can specify how it splits the text, how it filters out words, and so on.
 
-If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
-
-=== "use_tantivy=True"
-
-    ```python
-    table.create_fts_index(["text1", "text2"])
-    ```
-
-=== "use_tantivy=False"
-
-    [**Not supported yet**](https://github.com/lancedb/lance/issues/1195)
-
-Note that the search API call does not change - you can search over all indexed columns at once.
+For example, for a language with accents, you can configure the tokenizer with `ascii_folding` to remove accents, e.g. 'é' to 'e':
+```python
+table.create_fts_index("text",
+                       use_tantivy=False,
+                       language="French",
+                       stem=True,
+                       ascii_folding=True)
+```
 
 ## Filtering
 
-Currently the LanceDB full text search feature supports *post-filtering*, meaning filters are
-applied on top of the full text search results. This can be invoked via the familiar
-`where` syntax:
+LanceDB full text search supports filtering the search results by a condition; both pre-filtering and post-filtering are supported.
+
+This can be invoked via the familiar `where` syntax:
 
 === "Python"
 
@@ -169,98 +146,17 @@ applied on top of the full text search results. This can be invoked via the fami
         .await?;
     ```
 
-## Sorting
-
-!!! warning "Warn"
-    Sorting is available for only Tantivy-based FTS
-
-You can pre-sort the documents by specifying `ordering_field_names` when
-creating the full-text search index. Once pre-sorted, you can then specify
-`ordering_field_name` while searching to return results sorted by the given
-field. For example,
-
-```python
-table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=["sort_by_field"])
-
-(table.search("terms", ordering_field_name="sort_by_field")
- .limit(20)
- .to_list())
-```
-
-!!! note
-    If you wish to specify an ordering field at query time, you must also
-    have specified it during indexing time. Otherwise at query time, an
-    error will be raised that looks like `ValueError: The field does not exist: xxx`
-
-!!! note
-    The fields to sort on must be of typed unsigned integer, or else you will see
-    an error during indexing that looks like
-    `TypeError: argument 'value': 'float' object cannot be interpreted as an integer`.
-
-!!! note
-    You can specify multiple fields for ordering at indexing time.
-    But at query time only one ordering field is supported.
-
-
 ## Phrase queries vs. terms queries
 
 !!! warning "Warn"
     Lance-based FTS doesn't support queries using boolean operators `OR`, `AND`.
 
 For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
-or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
+or a **terms** search query like `old man sea`. For more details on the terms
 query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
 
-!!! tip "Note"
-    The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`.
-
-```py
-# This raises a syntax error
-table.search("they could have been dogs OR cats")
-```
-
-On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and
-the query is treated as a phrase query.
-
-```py
-# This works!
-table.search("they could have been dogs or cats")
-```
-
-It can be cumbersome to have to remember what will cause a syntax error depending on the type of
-query you want to perform. To make this simpler, when you want to perform a phrase query, you can
-enforce it in one of two ways:
-
-1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as
-   a phrase query.
-1. Explicitly declare the `phrase_query()` method. This is useful when you have a phrase query that
-   itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()`
-   is treated as a phrase query.
-
-In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
-double quotes replaced by single quotes.
-
-
-## Configurations (Only for Tantivy-based FTS)
-
-By default, LanceDB configures a 1GB heap size limit for creating the index. You can
-reduce this if running on a smaller node, or increase this for faster performance while
-indexing a larger corpus.
-
+To search for a phrase, the index must be created with `with_position=True`:
 ```python
-# configure a 512MB heap size
-heap = 1024 * 1024 * 512
-table.create_fts_index(["text1", "text2"], writer_heap_size=heap, replace=True)
+table.create_fts_index("text", use_tantivy=False, with_position=True)
 ```
-
-## Current limitations
-
-For that Tantivy-based FTS:
-
-1. Currently we do not yet support incremental writes.
-   If you add data after FTS index creation, it won't be reflected
-   in search results until you do a full reindex.
-
-2. We currently only support local filesystem paths for the FTS index.
-   This is a tantivy limitation. We've implemented an object store plugin
-   but there's no way in tantivy-py to specify to use it.
+This will allow you to search for phrases, but it will also significantly increase the index size and indexing time.
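As a worked example of the native-FTS tokenizer options documented above, a sketch combining accent folding and stemming with a query; the database path, table, column, and query text are placeholders:

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.open_table("my_table")  # placeholder table with a French-language "text" column

# Native (Lance) FTS index with stemming and accent folding, as described above
table.create_fts_index(
    "text",
    use_tantivy=False,
    language="French",
    stem=True,
    ascii_folding=True,
    replace=True,
)

# With ascii_folding enabled, "cafe" should also match documents containing "café"
table.search("cafe").limit(10).select(["text"]).to_list()
```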
docs/src/fts_tantivy.md (new file, 162 lines)

@@ -0,0 +1,162 @@
+# Full-text search (Tantivy-based FTS)
+
+LanceDB also provides support for full-text search via [Tantivy](https://github.com/quickwit-oss/tantivy), allowing you to incorporate keyword-based search (based on BM25) in your retrieval solutions.
+
+The tantivy-based FTS is only available in Python and does not support building indexes on object storage or incremental indexing. If you need these features, try the [native FTS](fts.md).
+
+## Installation
+
+To use full-text search, install the dependency [`tantivy-py`](https://github.com/quickwit-oss/tantivy-py):
+
+```sh
+# Say you want to use tantivy==0.20.1
+pip install tantivy==0.20.1
+```
+
+## Example
+
+Consider that we have a LanceDB table named `my_table`, whose string column `content` we want to index and query via keyword search; the FTS index must be created before you can search via keywords.
+
+```python
+import lancedb
+
+uri = "data/sample-lancedb"
+db = lancedb.connect(uri)
+
+table = db.create_table(
+    "my_table",
+    data=[
+        {"id": 1, "vector": [3.1, 4.1], "title": "happy puppy", "content": "Frodo was a happy puppy", "meta": "foo"},
+        {"id": 2, "vector": [5.9, 26.5], "title": "playing kittens", "content": "There are several kittens playing around the puppy", "meta": "bar"},
+    ],
+)
+
+# passing `use_tantivy=False` to use lance FTS index
+# `use_tantivy=True` by default
+table.create_fts_index("content", use_tantivy=True)
+table.search("puppy").limit(10).select(["content"]).to_list()
+# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
+# ...
+```
+
+It would search on all indexed columns by default, so it's useful when there are multiple indexed columns.
+
+!!! note
+    LanceDB automatically searches on the existing FTS index if the input to the search is of type `str`. If you provide a vector as input, LanceDB will search the ANN index instead.
+
+## Tokenization
+By default the text is tokenized by splitting on punctuation and whitespace and then removing tokens that are longer than 40 chars. For more language-specific tokenization, provide the argument `tokenizer_name` with the 2-letter language code followed by "_stem". So for English it would be "en_stem".
+
+```python
+table.create_fts_index("content", use_tantivy=True, tokenizer_name="en_stem", replace=True)
+```
+
+the following [languages](https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html) are currently supported.
+
+## Index multiple columns
+
+If you have multiple string columns to index, there's no need to combine them manually -- simply pass them all as a list to `create_fts_index`:
+
+```python
+table.create_fts_index(["title", "content"], use_tantivy=True, replace=True)
+```
+
+Note that the search API call does not change - you can search over all indexed columns at once.
+
+## Filtering
+
+Currently the LanceDB full text search feature supports *post-filtering*, meaning filters are
+applied on top of the full text search results (see [native FTS](fts.md) if you need pre-filtering). This can be invoked via the familiar
+`where` syntax:
+
+```python
+table.search("puppy").limit(10).where("meta='foo'").to_list()
+```
+
+## Sorting
+
+You can pre-sort the documents by specifying `ordering_field_names` when
+creating the full-text search index. Once pre-sorted, you can then specify
+`ordering_field_name` while searching to return results sorted by the given
+field. For example,
+
+```python
+table.create_fts_index(["content"], use_tantivy=True, ordering_field_names=["id"], replace=True)
+
+(table.search("puppy", ordering_field_name="id")
+ .limit(20)
+ .to_list())
+```
+
+!!! note
+    If you wish to specify an ordering field at query time, you must also
+    have specified it during indexing time. Otherwise at query time, an
+    error will be raised that looks like `ValueError: The field does not exist: xxx`
+
+!!! note
+    The fields to sort on must be of typed unsigned integer, or else you will see
+    an error during indexing that looks like
+    `TypeError: argument 'value': 'float' object cannot be interpreted as an integer`.
+
+!!! note
+    You can specify multiple fields for ordering at indexing time.
+    But at query time only one ordering field is supported.
+
+
+## Phrase queries vs. terms queries
+
+For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
+or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
+query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
+
+!!! tip "Note"
+    The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`.
+
+```py
+# This raises a syntax error
+table.search("they could have been dogs OR cats")
+```
+
+On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and
+the query is treated as a phrase query.
+
+```py
+# This works!
+table.search("they could have been dogs or cats")
+```
+
+It can be cumbersome to have to remember what will cause a syntax error depending on the type of
+query you want to perform. To make this simpler, when you want to perform a phrase query, you can
+enforce it in one of two ways:
+
+1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as
+   a phrase query.
+1. Explicitly declare the `phrase_query()` method. This is useful when you have a phrase query that
+   itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()`
+   is treated as a phrase query.
+
+In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested
+double quotes replaced by single quotes.
+
+
+## Configurations
+
+By default, LanceDB configures a 1GB heap size limit for creating the index. You can
+reduce this if running on a smaller node, or increase this for faster performance while
+indexing a larger corpus.
+
+```python
+# configure a 512MB heap size
+heap = 1024 * 1024 * 512
+table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=heap, replace=True)
+```
+
+## Current limitations
+
+1. Currently we do not yet support incremental writes.
+   If you add data after FTS index creation, it won't be reflected
+   in search results until you do a full reindex.
+
+2. We currently only support local filesystem paths for the FTS index.
+   This is a tantivy limitation. We've implemented an object store plugin
+   but there's no way in tantivy-py to specify to use it.
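Since the tantivy-based index has no incremental indexing (limitation 1 above), a minimal sketch of the full-reindex workflow after appending rows; the data values below are placeholders and continue the example table from this page:

```python
# Append new rows to the table; the existing tantivy FTS index will not see them
table.add([
    {"id": 3, "vector": [1.0, 2.0], "title": "sleepy cat",
     "content": "The cat slept through the afternoon", "meta": "baz"},
])

# Rebuild the index with replace=True so the new rows become searchable
table.create_fts_index(["title", "content"], use_tantivy=True, replace=True)
table.search("cat").limit(10).select(["content"]).to_list()
```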
@@ -49,7 +49,8 @@ The following pages go deeper into the internal of LanceDB and how to use it.
 * [Working with tables](guides/tables.md): Learn how to work with tables and their associated functions
 * [Indexing](ann_indexes.md): Understand how to create indexes
 * [Vector search](search.md): Learn how to perform vector similarity search
-* [Full-text search](fts.md): Learn how to perform full-text search
+* [Full-text search (native)](fts.md): Learn how to perform full-text search
+* [Full-text search (tantivy-based)](fts_tantivy.md): Learn how to perform full-text search using Tantivy
 * [Managing embeddings](embeddings/index.md): Managing embeddings and the embedding functions API in LanceDB
 * [Ecosystem Integrations](integrations/index.md): Integrate LanceDB with other tools in the data ecosystem
 * [Python API Reference](python/python.md): Python OSS and Cloud API references
docs/src/js/interfaces/FtsOptions.md (new file, 25 lines)

@@ -0,0 +1,25 @@
+[**@lancedb/lancedb**](../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../globals.md) / FtsOptions
+
+# Interface: FtsOptions
+
+Options to create an `FTS` index
+
+## Properties
+
+### withPosition?
+
+> `optional` **withPosition**: `boolean`
+
+Whether to store the positions of the term in the document.
+
+If this is true then the index will store the positions of the term in the document.
+This allows phrase queries to be run. But it also increases the size of the index,
+and the time to build the index.
+
+The default value is true.
+
+***
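The trade-off described by `withPosition` also applies to the Python `with_position` flag added in this change. A minimal sketch (table and column names are placeholders) of turning positions off when phrase queries are not needed, to keep the index smaller and faster to build:

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")
table = db.open_table("my_table")  # hypothetical table with a "text" column

# Positions are only needed for phrase queries; skipping them shrinks the
# index and speeds up indexing, per the FtsOptions description above.
table.create_fts_index("text", use_tantivy=False, with_position=False, replace=True)
```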
@@ -3,7 +3,7 @@ numpy
 pandas
 pylance
 duckdb
+tantivy==0.20.1
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch
 polars>=0.19, <=1.3.0
 
node/package-lock.json (generated, 65 changed lines)

@@ -326,6 +326,71 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "node_modules/@lancedb/vectordb-darwin-arm64": {
+      "version": "0.11.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.11.1-beta.1.tgz",
+      "integrity": "sha512-q9jcCbmcz45UHmjgecL6zK82WaqUJsARfniwXXPcnd8ooISVhPkgN+RVKv6edwI9T0PV+xVRYq+LQLlZu5fyxw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-darwin-x64": {
+      "version": "0.11.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.11.1-beta.1.tgz",
+      "integrity": "sha512-E5tCTS5TaTkssTPa+gdnFxZJ1f60jnSIJXhqufNFZk4s+IMViwR1BPqaqE++WY5c1uBI55ef1862CROKDKX4gg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
+      "version": "0.11.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.11.1-beta.1.tgz",
+      "integrity": "sha512-Obohy6TH31Uq+fp6ZisHR7iAsvgVPqBExrycVcIJqrLZnIe88N9OWUwBXkmfMAw/2hNJFwD4tU7+4U2FcBWX4w==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-linux-x64-gnu": {
+      "version": "0.11.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.11.1-beta.1.tgz",
+      "integrity": "sha512-3Meu0dgrzNrnBVVQhxkUSAOhQNmgtKHvOvmrRLUicV+X19hd33udihgxVpZZb9mpXenJ8lZsS+Jq6R0hWqntag==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-win32-x64-msvc": {
+      "version": "0.11.1-beta.1",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.11.1-beta.1.tgz",
+      "integrity": "sha512-BafZ9OJPQXsS7JW0weAl12wC+827AiRjfUrE5tvrYWZah2OwCF2U2g6uJ3x4pxfwEGsv5xcHFqgxlS7ttFkh+Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
     "node_modules/@neon-rs/cli": {
       "version": "0.0.160",
       "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
nodejs/package-lock.json (generated, 4 changed lines)

@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.11.0",
+  "version": "0.11.1-beta.1",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.11.0",
+      "version": "0.11.1-beta.1",
       "cpu": [
         "x64",
         "arm64"
@@ -3,7 +3,7 @@ name = "lancedb"
 # version in Cargo.toml
 dependencies = [
     "deprecation",
-    "pylance==0.18.3-beta.2",
+    "pylance==0.19.1",
    "requests>=2.31.0",
    "tqdm>=4.27.0",
    "pydantic>=1.10",
@@ -7,6 +7,27 @@ from ._lancedb import (
     IndexConfig,
 )
 
+lang_mapping = {
+    "ar": "Arabic",
+    "da": "Danish",
+    "du": "Dutch",
+    "en": "English",
+    "fi": "Finnish",
+    "fr": "French",
+    "de": "German",
+    "gr": "Greek",
+    "hu": "Hungarian",
+    "it": "Italian",
+    "no": "Norwegian",
+    "pt": "Portuguese",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "es": "Spanish",
+    "sv": "Swedish",
+    "ta": "Tamil",
+    "tr": "Turkish",
+}
+
 
 class BTree:
     """Describes a btree index configuration
@@ -78,7 +99,17 @@ class FTS:
     For example, it works with `title`, `description`, `content`, etc.
     """
 
-    def __init__(self, with_position: bool = True):
+    def __init__(
+        self,
+        with_position: bool = True,
+        base_tokenizer: str = "simple",
+        language: str = "English",
+        max_token_length: Optional[int] = 40,
+        lower_case: bool = True,
+        stem: bool = False,
+        remove_stop_words: bool = False,
+        ascii_folding: bool = False,
+    ):
         self._inner = LanceDbIndex.fts(with_position=with_position)
 
 
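A small illustrative sketch of what the widened `FTS` constructor accepts after this change, assuming the class is importable as `lancedb.index.FTS` (the module this diff modifies); the argument values are examples, not recommended defaults:

```python
from lancedb.index import FTS

# Mirrors the tokenizer knobs exposed on Table.create_fts_index below;
# every argument is optional and shown only to illustrate the new signature.
fts_config = FTS(
    with_position=True,        # keep positions so phrase queries work
    base_tokenizer="simple",   # split on punctuation/whitespace
    language="English",
    max_token_length=40,
    lower_case=True,
    stem=True,                 # opt back in to stemming
    remove_stop_words=True,    # opt back in to stop-word removal
    ascii_folding=False,
)
```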
@@ -55,6 +55,7 @@ from .util import (
     safe_import_polars,
     value_to_sql,
 )
+from .index import lang_mapping
 
 if TYPE_CHECKING:
     import PIL
@@ -497,10 +498,18 @@ class Table(ABC):
         ordering_field_names: Union[str, List[str]] = None,
         *,
         replace: bool = False,
-        with_position: bool = True,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
-        tokenizer_name: str = "default",
         use_tantivy: bool = True,
+        tokenizer_name: Optional[str] = None,
+        with_position: bool = True,
+        # tokenizer configs:
+        base_tokenizer: str = "simple",
+        language: str = "English",
+        max_token_length: Optional[int] = 40,
+        lower_case: bool = True,
+        stem: bool = False,
+        remove_stop_words: bool = False,
+        ascii_folding: bool = False,
     ):
         """Create a full-text search index on the table.
 
@@ -526,7 +535,6 @@ class Table(ABC):
             The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
             language code followed by "_stem". So for english it would be "en_stem".
             For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
-            only available with use_tantivy=True for now
         use_tantivy: bool, default True
             If True, use the legacy full-text search implementation based on tantivy.
             If False, use the new full-text search implementation based on lance-index.
@@ -1341,14 +1349,33 @@ class LanceTable(Table):
         ordering_field_names: Union[str, List[str]] = None,
         *,
         replace: bool = False,
-        with_position: bool = True,
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
-        tokenizer_name: str = "default",
         use_tantivy: bool = True,
+        tokenizer_name: Optional[str] = None,
+        with_position: bool = True,
+        # tokenizer configs:
+        base_tokenizer: str = "simple",
+        language: str = "English",
+        max_token_length: Optional[int] = 40,
+        lower_case: bool = True,
+        stem: bool = False,
+        remove_stop_words: bool = False,
+        ascii_folding: bool = False,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
                 raise ValueError("field_names must be a string when use_tantivy=False")
+            tokenizer_configs = {
+                "base_tokenizer": base_tokenizer,
+                "language": language,
+                "max_token_length": max_token_length,
+                "lower_case": lower_case,
+                "stem": stem,
+                "remove_stop_words": remove_stop_words,
+                "ascii_folding": ascii_folding,
+            }
+            if tokenizer_name is not None:
+                tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
             # delete the existing legacy index if it exists
             if replace:
                 path, fs, exist = self._get_fts_index_path()
@@ -1359,6 +1386,7 @@ class LanceTable(Table):
                 index_type="INVERTED",
                 replace=replace,
                 with_position=with_position,
+                **tokenizer_configs,
             )
             return
 
@@ -1381,6 +1409,8 @@ class LanceTable(Table):
                 "Full-text search is only supported on the local filesystem"
             )
 
+        if tokenizer_name is None:
+            tokenizer_name = "default"
         index = create_index(
             path,
             field_names,
@@ -1395,6 +1425,56 @@ class LanceTable(Table):
             writer_heap_size=writer_heap_size,
         )
 
+    def infer_tokenizer_configs(tokenizer_name: str) -> dict:
+        if tokenizer_name == "default":
+            return {
+                "base_tokenizer": "simple",
+                "language": "English",
+                "max_token_length": 40,
+                "lower_case": True,
+                "stem": False,
+                "remove_stop_words": False,
+                "ascii_folding": False,
+            }
+        elif tokenizer_name == "raw":
+            return {
+                "base_tokenizer": "raw",
+                "language": "English",
+                "max_token_length": None,
+                "lower_case": False,
+                "stem": False,
+                "remove_stop_words": False,
+                "ascii_folding": False,
+            }
+        elif tokenizer_name == "whitespace":
+            return {
+                "base_tokenizer": "whitespace",
+                "language": "English",
+                "max_token_length": None,
+                "lower_case": False,
+                "stem": False,
+                "remove_stop_words": False,
+                "ascii_folding": False,
+            }
+
+        # or it's with language stemming with pattern like "en_stem"
+        if len(tokenizer_name) != 7:
+            raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
+        lang = tokenizer_name[:2]
+        if tokenizer_name[-5:] != "_stem":
+            raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
+        if lang not in lang_mapping:
+            raise ValueError(f"Invalid language code {lang}")
+        return {
+            "base_tokenizer": "simple",
+            "language": lang_mapping[lang],
+            "max_token_length": 40,
+            "lower_case": True,
+            "stem": True,
+            "remove_stop_words": False,
+            "ascii_folding": False,
+        }
+
     def add(
         self,
         data: DATA,
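To make the `infer_tokenizer_configs` mapping above concrete, a sketch of two equivalent ways to request English stemming with the native index after this change; `table` is a placeholder for an open table with a `text` column:

```python
# Legacy shorthand: resolved internally through infer_tokenizer_configs("en_stem")
table.create_fts_index("text", use_tantivy=False, tokenizer_name="en_stem", replace=True)

# Equivalent explicit configuration (exactly what "en_stem" expands to above)
table.create_fts_index(
    "text",
    use_tantivy=False,
    base_tokenizer="simple",
    language="English",
    max_token_length=40,
    lower_case=True,
    stem=True,
    remove_stop_words=False,
    ascii_folding=False,
    replace=True,
)
```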
@@ -106,12 +106,41 @@ impl Index {
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     #[staticmethod]
-    pub fn fts(with_position: Option<bool>) -> Self {
+    pub fn fts(
+        with_position: Option<bool>,
+        base_tokenizer: Option<String>,
+        language: Option<String>,
+        max_token_length: Option<usize>,
+        lower_case: Option<bool>,
+        stem: Option<bool>,
+        remove_stop_words: Option<bool>,
+        ascii_folding: Option<bool>,
+    ) -> Self {
         let mut opts = FtsIndexBuilder::default();
         if let Some(with_position) = with_position {
             opts = opts.with_position(with_position);
         }
+        if let Some(base_tokenizer) = base_tokenizer {
+            opts.tokenizer_configs = opts.tokenizer_configs.base_tokenizer(base_tokenizer);
+        }
+        if let Some(language) = language {
+            opts.tokenizer_configs = opts.tokenizer_configs.language(&language).unwrap();
+        }
+        opts.tokenizer_configs = opts.tokenizer_configs.max_token_length(max_token_length);
+        if let Some(lower_case) = lower_case {
+            opts.tokenizer_configs = opts.tokenizer_configs.lower_case(lower_case);
+        }
+        if let Some(stem) = stem {
+            opts.tokenizer_configs = opts.tokenizer_configs.stem(stem);
+        }
+        if let Some(remove_stop_words) = remove_stop_words {
+            opts.tokenizer_configs = opts.tokenizer_configs.remove_stop_words(remove_stop_words);
+        }
+        if let Some(ascii_folding) = ascii_folding {
+            opts.tokenizer_configs = opts.tokenizer_configs.ascii_folding(ascii_folding);
+        }
         Self {
             inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
         }
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "1.79.0"
+channel = "1.80.0"
@@ -54,12 +54,14 @@ pub struct LabelListIndexBuilder {}
 #[derive(Debug, Clone)]
 pub struct FtsIndexBuilder {
     pub(crate) with_position: bool,
+    pub tokenizer_configs: TokenizerConfig,
 }
 
 impl Default for FtsIndexBuilder {
     fn default() -> Self {
         Self {
             with_position: true,
+            tokenizer_configs: TokenizerConfig::default(),
         }
     }
 }
@@ -72,4 +74,5 @@ impl FtsIndexBuilder {
     }
 }
 
+use lance_index::scalar::inverted::TokenizerConfig;
 pub use lance_index::scalar::FullTextSearchQuery;
@@ -1568,6 +1568,7 @@ impl NativeTable {
         let mut dataset = self.dataset.get_mut().await?;
         let fts_params = lance_index::scalar::InvertedIndexParams {
             with_position: fts_opts.with_position,
+            tokenizer_config: fts_opts.tokenizer_configs,
         };
         dataset
             .create_index(
@@ -2002,7 +2003,7 @@ impl TableInternal for NativeTable {
         self.dataset
             .get_mut()
             .await?
-            .add_columns(transforms, read_columns)
+            .add_columns(transforms, read_columns, None)
             .await?;
         Ok(())
     }