diff --git a/.github/workflows/node.yml b/.github/workflows/node.yml index e9615af7..09b15afa 100644 --- a/.github/workflows/node.yml +++ b/.github/workflows/node.yml @@ -20,6 +20,7 @@ env: # "1" means line tables only, which is useful for panic tracebacks. # # Use native CPU to accelerate tests if possible, especially for f16 + # target-cpu=haswell fixes failing ci build RUSTFLAGS: "-C debuginfo=1 -C target-cpu=haswell -C target-feature=+f16c,+avx2,+fma" RUST_BACKTRACE: "1" diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index fb1babb2..f772208d 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -19,27 +19,163 @@ Allows you to set parameters when registering a `sentence-transformers` object. | `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model | -```python -db = lancedb.connect("/tmp/db") -registry = EmbeddingFunctionRegistry.get_instance() -func = registry.get("sentence-transformers").create(device="cpu") +??? "Check out available sentence-transformer models here!" 
+ ```markdown + - sentence-transformers/all-MiniLM-L12-v2 + - sentence-transformers/paraphrase-mpnet-base-v2 + - sentence-transformers/gtr-t5-base + - sentence-transformers/LaBSE + - sentence-transformers/all-MiniLM-L6-v2 + - sentence-transformers/bert-base-nli-max-tokens + - sentence-transformers/bert-base-nli-mean-tokens + - sentence-transformers/bert-base-nli-stsb-mean-tokens + - sentence-transformers/bert-base-wikipedia-sections-mean-tokens + - sentence-transformers/bert-large-nli-cls-token + - sentence-transformers/bert-large-nli-max-tokens + - sentence-transformers/bert-large-nli-mean-tokens + - sentence-transformers/bert-large-nli-stsb-mean-tokens + - sentence-transformers/distilbert-base-nli-max-tokens + - sentence-transformers/distilbert-base-nli-mean-tokens + - sentence-transformers/distilbert-base-nli-stsb-mean-tokens + - sentence-transformers/distilroberta-base-msmarco-v1 + - sentence-transformers/distilroberta-base-msmarco-v2 + - sentence-transformers/nli-bert-base-cls-pooling + - sentence-transformers/nli-bert-base-max-pooling + - sentence-transformers/nli-bert-base + - sentence-transformers/nli-bert-large-cls-pooling + - sentence-transformers/nli-bert-large-max-pooling + - sentence-transformers/nli-bert-large + - sentence-transformers/nli-distilbert-base-max-pooling + - sentence-transformers/nli-distilbert-base + - sentence-transformers/nli-roberta-base + - sentence-transformers/nli-roberta-large + - sentence-transformers/roberta-base-nli-mean-tokens + - sentence-transformers/roberta-base-nli-stsb-mean-tokens + - sentence-transformers/roberta-large-nli-mean-tokens + - sentence-transformers/roberta-large-nli-stsb-mean-tokens + - sentence-transformers/stsb-bert-base + - sentence-transformers/stsb-bert-large + - sentence-transformers/stsb-distilbert-base + - sentence-transformers/stsb-roberta-base + - sentence-transformers/stsb-roberta-large + - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens + - 
sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-base-en-ko-nli-ststb + - sentence-transformers/xlm-r-bert-base-nli-mean-tokens + - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-large-en-ko-nli-ststb + - sentence-transformers/bert-base-nli-cls-token + - sentence-transformers/all-distilroberta-v1 + - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 + - sentence-transformers/multi-qa-distilbert-cos-v1 + - sentence-transformers/multi-qa-distilbert-dot-v1 + - sentence-transformers/multi-qa-mpnet-base-cos-v1 + - sentence-transformers/multi-qa-mpnet-base-dot-v1 + - sentence-transformers/nli-distilroberta-base-v2 + - sentence-transformers/all-MiniLM-L6-v1 + - sentence-transformers/all-mpnet-base-v1 + - sentence-transformers/all-mpnet-base-v2 + - sentence-transformers/all-roberta-large-v1 + - sentence-transformers/allenai-specter + - sentence-transformers/average_word_embeddings_glove.6B.300d + - sentence-transformers/average_word_embeddings_glove.840B.300d + - sentence-transformers/average_word_embeddings_komninos + - sentence-transformers/average_word_embeddings_levy_dependency + - sentence-transformers/clip-ViT-B-32-multilingual-v1 + - sentence-transformers/clip-ViT-B-32 + - sentence-transformers/distilbert-base-nli-stsb-quora-ranking + - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking + - sentence-transformers/distilroberta-base-paraphrase-v1 + - sentence-transformers/distiluse-base-multilingual-cased-v1 + - sentence-transformers/distiluse-base-multilingual-cased-v2 + - sentence-transformers/distiluse-base-multilingual-cased + - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base + - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base + - sentence-transformers/facebook-dpr-question_encoder-multiset-base + - sentence-transformers/facebook-dpr-question_encoder-single-nq-base + - sentence-transformers/gtr-t5-large + - 
sentence-transformers/gtr-t5-xl + - sentence-transformers/gtr-t5-xxl + - sentence-transformers/msmarco-MiniLM-L-12-v3 + - sentence-transformers/msmarco-MiniLM-L-6-v3 + - sentence-transformers/msmarco-MiniLM-L12-cos-v5 + - sentence-transformers/msmarco-MiniLM-L6-cos-v5 + - sentence-transformers/msmarco-bert-base-dot-v5 + - sentence-transformers/msmarco-bert-co-condensor + - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 + - sentence-transformers/msmarco-distilbert-base-tas-b + - sentence-transformers/msmarco-distilbert-base-v2 + - sentence-transformers/msmarco-distilbert-base-v3 + - sentence-transformers/msmarco-distilbert-base-v4 + - sentence-transformers/msmarco-distilbert-cos-v5 + - sentence-transformers/msmarco-distilbert-dot-v5 + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch + - sentence-transformers/msmarco-distilroberta-base-v2 + - sentence-transformers/msmarco-roberta-base-ance-firstp + - sentence-transformers/msmarco-roberta-base-v2 + - sentence-transformers/msmarco-roberta-base-v3 + - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 + - sentence-transformers/nli-mpnet-base-v2 + - sentence-transformers/nli-roberta-base-v2 + - sentence-transformers/nq-distilbert-base-v1 + - sentence-transformers/paraphrase-MiniLM-L12-v2 + - sentence-transformers/paraphrase-MiniLM-L3-v2 + - sentence-transformers/paraphrase-MiniLM-L6-v2 + - sentence-transformers/paraphrase-TinyBERT-L6-v2 + - sentence-transformers/paraphrase-albert-base-v2 + - sentence-transformers/paraphrase-albert-small-v2 + - sentence-transformers/paraphrase-distilroberta-base-v1 + - sentence-transformers/paraphrase-distilroberta-base-v2 + - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 + - sentence-transformers/paraphrase-xlm-r-multilingual-v1 + - sentence-transformers/quora-distilbert-base + - 
sentence-transformers/quora-distilbert-multilingual + - sentence-transformers/sentence-t5-base + - sentence-transformers/sentence-t5-large + - sentence-transformers/sentence-t5-xxl + - sentence-transformers/sentence-t5-xl + - sentence-transformers/stsb-distilroberta-base-v2 + - sentence-transformers/stsb-mpnet-base-v2 + - sentence-transformers/stsb-roberta-base-v2 + - sentence-transformers/stsb-xlm-r-multilingual + - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 + - sentence-transformers/clip-ViT-L-14 + - sentence-transformers/clip-ViT-B-16 + - sentence-transformers/use-cmlm-multilingual + - sentence-transformers/all-MiniLM-L12-v1 + ``` -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() +!!! info + You can also load many other model architectures from the library. For example, models from sources such as BAAI, Nomic, Salesforce Research, etc. + See this Hugging Face Hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). -table = db.create_table("words", schema=Words) -table.add( - [ - {"text": "hello world"} - {"text": "goodbye world"} - ] -) +!!! 
note "BAAI Embeddings example" + Here is an example that uses a BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) + ```python + db = lancedb.connect("/tmp/db") + registry = EmbeddingFunctionRegistry.get_instance() + model = registry.get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + + class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + + table = db.create_table("words", schema=Words) + table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + + query = "greetings" + actual = table.search(query).limit(1).to_pydantic(Words)[0] + print(actual.text) + ``` +Visit the sentence-transformers [HuggingFace Hub](https://huggingface.co/sentence-transformers) page for more information on the available models. -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` ### OpenAI embeddings LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances: diff --git a/python/python/lancedb/rerankers/cross_encoder.py b/python/python/lancedb/rerankers/cross_encoder.py index 5a066a13..c88b354a 100644 --- a/python/python/lancedb/rerankers/cross_encoder.py +++ b/python/python/lancedb/rerankers/cross_encoder.py @@ -14,7 +14,7 @@ class CrossEncoderReranker(Reranker): Parameters ---------- - model : str, default "cross-encoder/ms-marco-TinyBERT-L-6" + model_name : str, default "cross-encoder/ms-marco-TinyBERT-L-6" The name of the cross encoder model to use. See the sentence transformers documentation for a list of available models. column : str, default "text"