From 4761fa9bcb921332f5e1985aea32035439ad8632 Mon Sep 17 00:00:00 2001 From: Dhruv Garg <136477030+DhruvGarg111@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:58:34 +0530 Subject: [PATCH] fix(python): migrate gemini-text provider to google-genai sdk (#3250) ## Summary - migrate gemini-text embedding provider from deprecated google.generativeai to google.genai - update Python embedding extra dependency to google-genai - update default model name to gemini-embedding-001 - adapt embed calls to Client().models.embed_content(...) - apply lint fixes from CI ## Related - Closes #3191 --- python/pyproject.toml | 2 +- .../python/lancedb/embeddings/gemini_text.py | 66 +++++++++++++------ 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 98dfad32c..280f6cfec 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -83,7 +83,7 @@ embeddings = [ "colpali-engine>=0.3.10", "huggingface_hub>=0.19.0", "InstructorEmbedding>=1.0.1", - "google.generativeai>=0.3.0", + "google-genai>=1.0.0", "boto3>=1.28.57", "awscli>=1.44.38", "botocore>=1.31.57", diff --git a/python/python/lancedb/embeddings/gemini_text.py b/python/python/lancedb/embeddings/gemini_text.py index 9756115c5..a9b10b002 100644 --- a/python/python/lancedb/embeddings/gemini_text.py +++ b/python/python/lancedb/embeddings/gemini_text.py @@ -19,10 +19,10 @@ from .utils import TEXT, api_key_not_found_help @register("gemini-text") class GeminiText(TextEmbeddingFunction): """ - An embedding function that uses the Google's Gemini API. Requires GOOGLE_API_KEY to + An embedding function that uses Google's Gemini API. Requires GOOGLE_API_KEY to be set. 
- https://ai.google.dev/docs/embeddings_guide + https://ai.google.dev/gemini-api/docs/embeddings Supports various tasks types: | Task Type | Description | @@ -46,9 +46,12 @@ class GeminiText(TextEmbeddingFunction): Parameters ---------- - name: str, default "models/embedding-001" - The name of the model to use. See the Gemini documentation for a list of - available models. + name: str, default "gemini-embedding-001" + The name of the model to use. Supported models include: + - "gemini-embedding-001" (3072 dimensions by default) + + Note: The legacy "models/embedding-001" format is also supported but + "gemini-embedding-001" is recommended. query_task_type: str, default "retrieval_query" Sets the task type for the queries. @@ -77,7 +80,7 @@ class GeminiText(TextEmbeddingFunction): """ - name: str = "models/embedding-001" + name: str = "gemini-embedding-001" query_task_type: str = "retrieval_query" source_task_type: str = "retrieval_document" @@ -114,23 +117,48 @@ class GeminiText(TextEmbeddingFunction): texts: list[str] or np.ndarray (of str) The texts to embed """ - if ( - kwargs.get("task_type") == "retrieval_document" - ): # Provide a title to use existing API design - title = "Embedding of a document" - kwargs["title"] = title + from google.genai import types - return [ - self.client.embed_content(model=self.name, content=text, **kwargs)[ - "embedding" - ] - for text in texts - ] + task_type = kwargs.get("task_type") + + # Build content objects for embed_content + contents = [] + for text in texts: + if task_type == "retrieval_document": + # Provide a title for retrieval_document task + contents.append( + {"parts": [{"text": "Embedding of a document"}, {"text": text}]} + ) + else: + contents.append({"parts": [{"text": text}]}) + + # Build config + config_kwargs = {} + if task_type: + config_kwargs["task_type"] = task_type.upper() # API expects uppercase + + # Call embed_content for each content + embeddings = [] + for content in contents: + config = ( 
types.EmbedContentConfig(**config_kwargs) if config_kwargs else None + ) + response = self.client.models.embed_content( + model=self.name, + contents=content, + config=config, + ) + embeddings.append(response.embeddings[0].values) + + return embeddings @cached_property def client(self): - genai = attempt_import_or_raise("google.generativeai", "google.generativeai") + attempt_import_or_raise("google.genai", "google-genai") if not os.environ.get("GOOGLE_API_KEY"): api_key_not_found_help("google") - return genai + + from google import genai as genai_module + + return genai_module.Client(api_key=os.environ.get("GOOGLE_API_KEY"))