diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md
index f772208d..67422025 100644
--- a/docs/src/embeddings/default_embedding_functions.md
+++ b/docs/src/embeddings/default_embedding_functions.md
@@ -177,6 +177,33 @@ Allows you to set parameters when registering a `sentence-transformers` object.
 
 Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models.
 
+### HuggingFace embedding models
+We support all HuggingFace models that can be loaded through the [transformers](https://huggingface.co/docs/transformers/en/index) library. The default model is `colbert-ir/colbertv2.0`, which also has its own alias in the registry: `registry.get("colbert")`.
+
+Example usage:
+```python
+import lancedb
+import pandas as pd
+
+from lancedb.embeddings import get_registry
+from lancedb.pydantic import LanceModel, Vector
+
+db = lancedb.connect("/tmp/db")
+model = get_registry().get("huggingface").create(name="facebook/bart-base")
+
+class Words(LanceModel):
+    text: str = model.SourceField()
+    vector: Vector(model.ndims()) = model.VectorField()
+
+df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]})
+table = db.create_table("greets", schema=Words)
+table.add(df)
+
+query = "greetings"
+actual = table.search(query).limit(1).to_pydantic(Words)[0]
+print(actual.text)
+```
+
 ### OpenAI embeddings
 
 LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances:
diff --git a/python/pyproject.toml b/python/pyproject.toml
index d3dcf03c..cbac7d18 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -90,7 +90,7 @@ requires = ["maturin>=1.4"]
 build-backend = "maturin"
 
 [tool.ruff.lint]
-select = ["F", "E", "W", "I", "G", "TCH", "PERF"]
+select = ["F", "E", "W", "G", "TCH", "PERF"]
 
 [tool.pytest.ini_options]
 addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py"
diff --git a/python/python/lancedb/embeddings/__init__.py b/python/python/lancedb/embeddings/__init__.py
index 0a6f6dfe..1b88dd24 100644
--- a/python/python/lancedb/embeddings/__init__.py
+++ b/python/python/lancedb/embeddings/__init__.py
@@ -10,7 +10,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 # ruff: noqa: F401
 from .base import EmbeddingFunction, EmbeddingFunctionConfig, TextEmbeddingFunction
 from .bedrock import BedRockText
@@ -21,4 +20,7 @@ from .open_clip import OpenClipEmbeddings
 from .openai import OpenAIEmbeddings
 from .registry import EmbeddingFunctionRegistry, get_registry
 from .sentence_transformers import SentenceTransformerEmbeddings
+from .gte import GteEmbeddings
+from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings
+from .imagebind import ImageBindEmbeddings
 from .utils import with_embeddings
diff --git a/python/python/lancedb/embeddings/imagebind.py b/python/python/lancedb/embeddings/imagebind.py
index 1179e370..209a134b 100644
--- a/python/python/lancedb/embeddings/imagebind.py
+++ b/python/python/lancedb/embeddings/imagebind.py
@@ -38,6 +38,9 @@ class ImageBindEmbeddings(EmbeddingFunction):
     device: str = "cpu"
     normalize: bool = False
 
+    class Config:
+        keep_untouched = (cached_property,)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._ndims = 1024
diff --git a/python/python/lancedb/embeddings/transformers.py b/python/python/lancedb/embeddings/transformers.py
new file mode 100644
index 00000000..f796bd2e
--- /dev/null
+++ b/python/python/lancedb/embeddings/transformers.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2023. LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import cached_property
+from typing import Any, List
+
+import numpy as np
+from pydantic import PrivateAttr
+
+from ..util import attempt_import_or_raise
+from .base import EmbeddingFunction
+from .registry import register
+from .utils import TEXT
+
+
+@register("huggingface")
+class TransformersEmbeddingFunction(EmbeddingFunction):
+    """
+    An embedding function that can use any model from the transformers library.
+
+    Parameters
+    ----------
+    name : str
+        The name of the model to use. This should be a model name that can be
+        loaded by transformers.AutoModel.from_pretrained, for example
+        "bert-base-uncased".
+        Defaults to "colbert-ir/colbertv2.0".
+
+    To install the required dependency, run `pip install transformers`.
+    You may also need to install PyTorch: https://pytorch.org/get-started/locally/
+    """
+
+    name: str = "colbert-ir/colbertv2.0"
+    _tokenizer: Any = PrivateAttr()
+    _model: Any = PrivateAttr()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._ndims = None
+        transformers = attempt_import_or_raise("transformers")
+        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.name)
+        self._model = transformers.AutoModel.from_pretrained(self.name)
+
+    class Config:
+        keep_untouched = (cached_property,)
+
+    def ndims(self):
+        if self._ndims is None:
+            self._ndims = self._model.config.hidden_size
+        return self._ndims
+
+    def compute_query_embeddings(
+        self, query: str, *args, **kwargs
+    ) -> List[np.ndarray]:
+        return self.compute_source_embeddings(query)
+
+    def compute_source_embeddings(
+        self, texts: TEXT, *args, **kwargs
+    ) -> List[np.ndarray]:
+        texts = self.sanitize_input(texts)
+        embeddings = []
+        for text in texts:
+            encoding = self._tokenizer(
+                text, return_tensors="pt", padding=True, truncation=True
+            )
+            # Mean-pool the last hidden state over the sequence dimension
+            emb = self._model(**encoding).last_hidden_state.mean(dim=1).squeeze()
+            embeddings.append(emb.detach().numpy())
+        return embeddings
+
+
+@register("colbert")
+class ColbertEmbeddings(TransformersEmbeddingFunction):
+    """
+    An embedding function that uses the ColBERT model from the HuggingFace hub.
+
+    Parameters
+    ----------
+    name : str
+        The name of the model to use. Defaults to "colbert-ir/colbertv2.0".
+
+    To install the required dependency, run `pip install transformers`.
+    You may also need to install PyTorch: https://pytorch.org/get-started/locally/
+    """
+
+    name: str = "colbert-ir/colbertv2.0"
diff --git a/python/python/tests/test_embeddings_slow.py b/python/python/tests/test_embeddings_slow.py
index 9dc85bfc..14139307 100644
--- a/python/python/tests/test_embeddings_slow.py
+++ b/python/python/tests/test_embeddings_slow.py
@@ -45,7 +45,7 @@ except Exception:
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("alias", ["sentence-transformers", "openai"])
+@pytest.mark.parametrize("alias", ["sentence-transformers", "openai", "huggingface"])
 def test_basic_text_embeddings(alias, tmp_path):
     db = lancedb.connect(tmp_path)
     registry = get_registry()
@@ -84,7 +84,7 @@ def test_basic_text_embeddings(alias, tmp_path):
         )
     )
 
-    query = "greetings"
+    query = "greeting"
     actual = (
         table.search(query, vector_column_name="vector").limit(1).to_pydantic(Words)[0]
     )
@@ -184,9 +184,9 @@ def test_imagebind(tmp_path):
     import shutil
     import tempfile
 
-    import lancedb.embeddings.imagebind
     import pandas as pd
     import requests
+
     from lancedb.embeddings import get_registry
     from lancedb.pydantic import LanceModel, Vector
 
@@ -321,8 +321,6 @@ def test_gemini_embedding(tmp_path):
     )
 @pytest.mark.slow
 def test_gte_embedding(tmp_path):
-    import lancedb.embeddings.gte
-
     model = get_registry().get("gte-text").create()
 
     class TextModel(LanceModel):
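
Below the diff, a minimal usage sketch of the new `colbert` registry alias added by this change, mirroring the `huggingface` example in the docs hunk above. The connection path, table name, and sample data are illustrative, not part of the diff:

```python
import lancedb
import pandas as pd

from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# "colbert" resolves to ColbertEmbeddings, which defaults to colbert-ir/colbertv2.0
model = get_registry().get("colbert").create()

class Words(LanceModel):
    text: str = model.SourceField()
    vector: Vector(model.ndims()) = model.VectorField()

db = lancedb.connect("/tmp/colbert-demo")  # illustrative path
table = db.create_table("words", schema=Words)
table.add(pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]}))

result = table.search("greetings").limit(1).to_pydantic(Words)[0]
print(result.text)
```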