From e6f23a298ffdd239b8935792e9b9b352294e158d Mon Sep 17 00:00:00 2001
From: prrao87 <prrao87@gmail.com>
Date: Tue, 17 Feb 2026 10:33:33 -0500
Subject: [PATCH] Remove outdated GTE and SigLIP embedding modules

---
 python/python/lancedb/embeddings/gte.py    | 123 -----------------
 python/python/lancedb/embeddings/siglip.py | 148 ---------------------
 2 files changed, 271 deletions(-)
 delete mode 100644 python/python/lancedb/embeddings/gte.py
 delete mode 100644 python/python/lancedb/embeddings/siglip.py

diff --git a/python/python/lancedb/embeddings/gte.py b/python/python/lancedb/embeddings/gte.py
deleted file mode 100644
index b4e7e16e2..000000000
--- a/python/python/lancedb/embeddings/gte.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright The LanceDB Authors
-
-
-from typing import List, Union
-
-import numpy as np
-
-from ..util import attempt_import_or_raise
-from .base import TextEmbeddingFunction
-from .registry import register
-from .utils import weak_lru
-
-
-@register("gte-text")
-class GteEmbeddings(TextEmbeddingFunction):
-    """
-    An embedding function that uses GTE-LARGE MLX format(for Apple silicon devices only)
-    as well as the standard cpu/gpu version from: https://huggingface.co/thenlper/gte-large.
-
-    For Apple users, you will need the mlx package insalled, which can be done with:
-        pip install mlx
-
-    Parameters
-    ----------
-    name: str, default "thenlper/gte-large"
-        The name of the model to use.
-    device: str, default "cpu"
-        Sets the device type for the model.
-    normalize: str, default "True"
-        Controls normalize param in encode function for the transformer.
-    mlx: bool, default False
-        Controls which model to use. False for gte-large,True for the mlx version.
-
-    Examples
-    --------
-    import lancedb
-    import lancedb.embeddings.gte
-    from lancedb.embeddings import get_registry
-    from lancedb.pydantic import LanceModel, Vector
-    import pandas as pd
-
-    model = get_registry().get("gte-text").create() # mlx=True for Apple silicon
-    class TextModel(LanceModel):
-        text: str = model.SourceField()
-        vector: Vector(model.ndims()) = model.VectorField()
-
-    df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]})
-    db = lancedb.connect("~/.lancedb")
-    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
-
-    tbl.add(df)
-    rs = tbl.search("hello").limit(1).to_pandas()
-
-    """
-
-    name: str = "thenlper/gte-large"
-    device: str = "cpu"
-    normalize: bool = True
-    mlx: bool = False
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self._ndims = None
-        if kwargs:
-            self.mlx = kwargs.get("mlx", False)
-            if self.mlx is True:
-                self.name = "gte-mlx"
-
-    @property
-    def embedding_model(self):
-        """
-        Get the embedding model specified by the flag,
-        name and device. This is cached so that the model is only loaded
-        once per process.
-        """
-        return self.get_embedding_model()
-
-    def ndims(self):
-        if self.mlx is True:
-            self._ndims = self.embedding_model.dims
-        if self._ndims is None:
-            self._ndims = len(self.generate_embeddings("foo")[0])
-        return self._ndims
-
-    def generate_embeddings(
-        self, texts: Union[List[str], np.ndarray]
-    ) -> List[np.array]:
-        """
-        Get the embeddings for the given texts.
-
-        Parameters
-        ----------
-        texts: list[str] or np.ndarray (of str)
-            The texts to embed
-        """
-        if self.mlx is True:
-            return self.embedding_model.run(list(texts)).tolist()
-
-        return self.embedding_model.encode(
-            list(texts),
-            convert_to_numpy=True,
-            normalize_embeddings=self.normalize,
-        ).tolist()
-
-    @weak_lru(maxsize=1)
-    def get_embedding_model(self):
-        """
-        Get the embedding model specified by the flag,
-        name and device. This is cached so that the model is only loaded
-        once per process.
-        """
-        if self.mlx is True:
-            from .gte_mlx_model import Model
-
-            return Model()
-        else:
-            sentence_transformers = attempt_import_or_raise(
-                "sentence_transformers", "sentence-transformers"
-            )
-            return sentence_transformers.SentenceTransformer(
-                self.name, device=self.device
-            )
diff --git a/python/python/lancedb/embeddings/siglip.py b/python/python/lancedb/embeddings/siglip.py
deleted file mode 100644
index 41228bbe0..000000000
--- a/python/python/lancedb/embeddings/siglip.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright The LanceDB Authors
-
-import concurrent.futures
-import io
-import os
-from typing import TYPE_CHECKING, List, Union
-import urllib.parse as urlparse
-
-import numpy as np
-import pyarrow as pa
-from tqdm import tqdm
-from pydantic import PrivateAttr
-
-from ..util import attempt_import_or_raise
-from .base import EmbeddingFunction
-from .registry import register
-from .utils import IMAGES, url_retrieve
-
-if TYPE_CHECKING:
-    import PIL
-    import torch
-
-
-@register("siglip")
-class SigLipEmbeddings(EmbeddingFunction):
-    model_name: str = "google/siglip-base-patch16-224"
-    device: str = "cpu"
-    batch_size: int = 64
-    normalize: bool = True
-
-    _model = PrivateAttr()
-    _processor = PrivateAttr()
-    _tokenizer = PrivateAttr()
-    _torch = PrivateAttr()
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        transformers = attempt_import_or_raise("transformers")
-        self._torch = attempt_import_or_raise("torch")
-
-        self._processor = transformers.AutoProcessor.from_pretrained(self.model_name)
-        self._model = transformers.SiglipModel.from_pretrained(self.model_name)
-        self._model.to(self.device)
-        self._model.eval()
-        self._ndims = None
-
-    def ndims(self):
-        if self._ndims is None:
-            self._ndims = self.generate_text_embeddings("foo").shape[0]
-        return self._ndims
-
-    def compute_query_embeddings(
-        self, query: Union[str, "PIL.Image.Image"], *args, **kwargs
-    ) -> List[np.ndarray]:
-        if isinstance(query, str):
-            return [self.generate_text_embeddings(query)]
-        else:
-            PIL_Image = attempt_import_or_raise("PIL.Image", "pillow")
-            if isinstance(query, PIL_Image.Image):
-                return [self.generate_image_embedding(query)]
-            else:
-                raise TypeError("SigLIP supports str or PIL Image as query")
-
-    def generate_text_embeddings(self, text: str) -> np.ndarray:
-        torch = self._torch
-        text_inputs = self._processor(
-            text=text,
-            return_tensors="pt",
-            padding="max_length",
-            truncation=True,
-            max_length=64,
-        ).to(self.device)
-
-        with torch.no_grad():
-            text_features = self._model.get_text_features(**text_inputs)
-            if self.normalize:
-                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-            return text_features.cpu().detach().numpy().squeeze()
-
-    def sanitize_input(self, images: IMAGES) -> Union[List[bytes], np.ndarray]:
-        if isinstance(images, (str, bytes)):
-            images = [images]
-        elif isinstance(images, pa.Array):
-            images = images.to_pylist()
-        elif isinstance(images, pa.ChunkedArray):
-            images = images.combine_chunks().to_pylist()
-        return images
-
-    def compute_source_embeddings(
-        self, images: IMAGES, *args, **kwargs
-    ) -> List[np.ndarray]:
-        images = self.sanitize_input(images)
-        embeddings = []
-
-        for i in range(0, len(images), self.batch_size):
-            j = min(i + self.batch_size, len(images))
-            batch = images[i:j]
-            embeddings.extend(self._parallel_get(batch))
-        return embeddings
-
-    def _parallel_get(self, images: Union[List[str], List[bytes]]) -> List[np.ndarray]:
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.generate_image_embedding, image)
-                for image in images
-            ]
-            return [f.result() for f in tqdm(futures, desc="SigLIP Embedding")]
-
-    def generate_image_embedding(
-        self, image: Union[str, bytes, "PIL.Image.Image"]
-    ) -> np.ndarray:
-        image = self._to_pil(image)
-        image = self._processor(images=image, return_tensors="pt")["pixel_values"]
-        return self._encode_and_normalize_image(image)
-
-    def _encode_and_normalize_image(self, image_tensor: "torch.Tensor") -> np.ndarray:
-        torch = self._torch
-        with torch.no_grad():
-            image_features = self._model.get_image_features(
-                image_tensor.to(self.device)
-            )
-            if self.normalize:
-                image_features = image_features / image_features.norm(
-                    dim=-1, keepdim=True
-                )
-            return image_features.cpu().detach().numpy().squeeze()
-
-    def _to_pil(self, image: Union[str, bytes, "PIL.Image.Image"]):
-        PIL_Image = attempt_import_or_raise("PIL.Image", "pillow")
-        if isinstance(image, PIL_Image.Image):
-            return image.convert("RGB") if image.mode != "RGB" else image
-        elif isinstance(image, bytes):
-            return PIL_Image.open(io.BytesIO(image)).convert("RGB")
-        elif isinstance(image, str):
-            parsed = urlparse.urlparse(image)
-            if parsed.scheme == "file":
-                return PIL_Image.open(parsed.path).convert("RGB")
-            elif parsed.scheme == "":
-                path = image if os.name == "nt" else parsed.path
-                return PIL_Image.open(path).convert("RGB")
-            elif parsed.scheme.startswith("http"):
-                image_bytes = url_retrieve(image)
-                return PIL_Image.open(io.BytesIO(image_bytes)).convert("RGB")
-            else:
-                raise NotImplementedError("Only local and http(s) urls are supported")
-        else:
-            raise ValueError(f"Unsupported image type: {type(image)}")