From 155ec16161d6b7ad9c96ce9f2b05b53938fb2226 Mon Sep 17 00:00:00 2001 From: Prashanth Rao <35005448+prrao87@users.noreply.github.com> Date: Wed, 18 Feb 2026 12:04:39 -0500 Subject: [PATCH] fix: deprecate outdated files for embedding registry (#3037) There are old and outdated files in our embedding registry that can confuse coding agents. This PR deprecates the following files, which have been superseded by newer methods for generating such embeddings. - Deprecate `embeddings/siglip.py` - Deprecate `embeddings/gte.py` ## Why this change? Per a discussion with @AyushExel, the [embedding registry directory](https://github.com/lancedb/lancedb/tree/1840aa7edcdbf0c3f7d3a303000c8f75842a2f55/python/python/lancedb/embeddings) in the LanceDB repo has a number of outdated files that need to be deprecated. See https://github.com/lancedb/docs/issues/85 for the docs gaps that prompted this change. - Add note in `openclip` docs that it can be used for SigLip embeddings, which it now supports - Add note in the `sentence-transformers` page that ALL text embedding models on Hugging Face can be used --- python/python/lancedb/embeddings/gte.py | 10 ++++++++++ python/python/lancedb/embeddings/siglip.py | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/python/python/lancedb/embeddings/gte.py b/python/python/lancedb/embeddings/gte.py index b4e7e16e2..4f547a30b 100644 --- a/python/python/lancedb/embeddings/gte.py +++ b/python/python/lancedb/embeddings/gte.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright The LanceDB Authors +import warnings from typing import List, Union import numpy as np @@ -15,6 +16,8 @@ from .utils import weak_lru @register("gte-text") class GteEmbeddings(TextEmbeddingFunction): """ + Deprecated: GTE embeddings should be used through sentence-transformers. + An embedding function that uses GTE-LARGE MLX format(for Apple silicon devices only) as well as the standard cpu/gpu version from: https://huggingface.co/thenlper/gte-large. 
@@ -61,6 +64,13 @@ class GteEmbeddings(TextEmbeddingFunction): def __init__(self, **kwargs): super().__init__(**kwargs) + warnings.warn( + "GTE embeddings as a standalone embedding function are deprecated. " + "Use the 'sentence-transformers' embedding function with a GTE model " + "instead.", + DeprecationWarning, + stacklevel=3, + ) self._ndims = None if kwargs: self.mlx = kwargs.get("mlx", False) diff --git a/python/python/lancedb/embeddings/siglip.py b/python/python/lancedb/embeddings/siglip.py index 41228bbe0..cd77c1f5d 100644 --- a/python/python/lancedb/embeddings/siglip.py +++ b/python/python/lancedb/embeddings/siglip.py @@ -6,6 +6,7 @@ import io import os from typing import TYPE_CHECKING, List, Union import urllib.parse as urlparse +import warnings import numpy as np import pyarrow as pa @@ -24,6 +25,7 @@ if TYPE_CHECKING: @register("siglip") class SigLipEmbeddings(EmbeddingFunction): + # Deprecated: prefer CLIP embeddings via `open-clip`. model_name: str = "google/siglip-base-patch16-224" device: str = "cpu" batch_size: int = 64 @@ -36,6 +38,12 @@ class SigLipEmbeddings(EmbeddingFunction): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + warnings.warn( + "SigLip embeddings are deprecated. Use CLIP embeddings via the " + "'open-clip' embedding function instead.", + DeprecationWarning, + stacklevel=3, + ) transformers = attempt_import_or_raise("transformers") self._torch = attempt_import_or_raise("torch")