From d6b505477886b48b4868a56481373e2a29a72fcd Mon Sep 17 00:00:00 2001 From: Akash Saravanan Date: Mon, 30 Sep 2024 13:36:28 -0600 Subject: [PATCH] feat(python): add support for trust_remote_code in hf embeddings (#1712) Resolves #1709. Adds `trust_remote_code` as a parameter to the `TransformersEmbeddingFunction` class with a default of False. Updated relevant documentation with the same. --- .../text_embedding_functions/huggingface_embedding.md | 2 +- python/python/lancedb/embeddings/transformers.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md index 80502b49..eb0dfdea 100644 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md @@ -1,5 +1,5 @@ # Huggingface embedding models -We offer support for all huggingface models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")` +We offer support for all Hugging Face models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")`. Some Hugging Face models might require custom models defined on the HuggingFace Hub in their own modeling files. You may enable this by setting `trust_remote_code=True`. This option should only be set to True for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. 
Example usage - ```python diff --git a/python/python/lancedb/embeddings/transformers.py b/python/python/lancedb/embeddings/transformers.py index f532f7c9..caee75fb 100644 --- a/python/python/lancedb/embeddings/transformers.py +++ b/python/python/lancedb/embeddings/transformers.py @@ -40,6 +40,11 @@ class TransformersEmbeddingFunction(EmbeddingFunction): The device to use for the model. Default is "cpu". show_progress_bar : bool Whether to show a progress bar when loading the model. Default is True. + trust_remote_code : bool + Whether or not to allow for custom models defined on the HuggingFace + Hub in their own modeling files. This option should only be set to True + for repositories you trust and in which you have read the code, as it + will execute code present on the Hub on your local machine. to download package, run : `pip install transformers` @@ -49,6 +54,7 @@ class TransformersEmbeddingFunction(EmbeddingFunction): name: str = "colbert-ir/colbertv2.0" device: str = "cpu" + trust_remote_code: bool = False _tokenizer: Any = PrivateAttr() _model: Any = PrivateAttr() @@ -57,7 +63,9 @@ class TransformersEmbeddingFunction(EmbeddingFunction): self._ndims = None transformers = attempt_import_or_raise("transformers") self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.name) - self._model = transformers.AutoModel.from_pretrained(self.name) + self._model = transformers.AutoModel.from_pretrained( + self.name, trust_remote_code=self.trust_remote_code + ) self._model.to(self.device) if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat