mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00
Compare commits
11 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8bcdc81fd3 | ||
|
|
39e14c70c5 | ||
|
|
af8263af94 | ||
|
|
be4ab9eef3 | ||
|
|
184d2bc969 | ||
|
|
ff6f005336 | ||
|
|
49333e522c | ||
|
|
4568df422d | ||
|
|
986891db98 | ||
|
|
036bf02901 | ||
|
|
4e31f0cc7a |
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.4.2
|
||||
current_version = 0.4.3
|
||||
commit = True
|
||||
message = Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -67,7 +67,7 @@ We'll cover the basics of using LanceDB on your local machine in this section.
|
||||
!!! warning
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
If you want to overwrite the table, you can pass in `mode="overwrite"`
|
||||
If you want to make sure you overwrite the table, pass in `mode="overwrite"`
|
||||
to the `createTable` function.
|
||||
|
||||
=== "Javascript"
|
||||
|
||||
@@ -118,6 +118,42 @@ texts = [{"text": "Capitalism has been dominant in the Western world since the e
|
||||
tbl.add(texts)
|
||||
```
|
||||
|
||||
## Gemini Embedding Function
|
||||
With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide.
|
||||
The Gemini Embedding Model API supports various task types:
|
||||
|
||||
| Task Type | Description |
|
||||
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. |
|
||||
| "`retrieval_document`"  | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically provided by Embeddings API |
|
||||
| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). |
|
||||
| "`classification`" | Specifies that the embeddings will be used for classification. |
|
||||
| "`clustering`"          | Specifies that the embeddings will be used for clustering.                                                                                                  |
|
||||
|
||||
|
||||
Usage Example:
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
|
||||
model = get_registry().get("gemini-text").create()
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||
db = lancedb.connect("~/.lancedb")
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df)
|
||||
rs = tbl.search("hello").limit(1).to_pandas()
|
||||
```
|
||||
|
||||
## Multi-modal embedding functions
|
||||
Multi-modal embedding functions allow you to query your table using both images and text.
|
||||
|
||||
|
||||
@@ -31,13 +31,23 @@ This guide will show how to create tables, insert data into them, and update the
|
||||
```
|
||||
|
||||
!!! info "Note"
|
||||
If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you can pass in mode="overwrite" to the createTable function.
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
|
||||
`create_table` supports an optional `exist_ok` parameter. When set to True
|
||||
and the table exists, then it simply opens the existing table. The data you
|
||||
passed in will NOT be appended to the table in that case.
|
||||
|
||||
```python
|
||||
db.create_table("name", data, exist_ok=True)
|
||||
```
|
||||
|
||||
Sometimes you want to make sure that you start fresh. If you want to
|
||||
overwrite the table, you can pass in `mode="overwrite"` to the `create_table` function.
|
||||
|
||||
```python
|
||||
db.create_table("name", data, mode="overwrite")
|
||||
```
|
||||
|
||||
|
||||
### From pandas DataFrame
|
||||
|
||||
```python
|
||||
|
||||
74
node/package-lock.json
generated
74
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.4.2",
|
||||
"version": "0.4.3",
|
||||
"lockfileVersion": 2,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.4.2",
|
||||
"version": "0.4.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -53,11 +53,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.2"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.3",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.3",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.3",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.3",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@75lb/deep-merge": {
|
||||
@@ -337,9 +337,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
|
||||
"integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.3.tgz",
|
||||
"integrity": "sha512-47CvvSaV1EdUsFEpXUJApTk+hMzAhCxVizipCFUlXCgcmzpCDL86wNgJij/X9a+j6zADhIX//Lsu0qd/an/Bpw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -349,9 +349,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
|
||||
"integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.3.tgz",
|
||||
"integrity": "sha512-UlZZv8CmJIuRJNJG+Y1VmFsGyPR8W/72Q5EwgMMsSES6zpMQ9pNdBDWhL3UGX6nMRgnbprkwYiWJ3xHhJvtqtw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -361,9 +361,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
|
||||
"integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.3.tgz",
|
||||
"integrity": "sha512-L6NVJr/lKEd8+904FzZNpT8BGQMs2cHNYbGJMIaVvGnMiIJgKAFKtOyGtdDjoe1xRZoEw21yjRGksGbnRO5wHQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -373,9 +373,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
|
||||
"integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.3.tgz",
|
||||
"integrity": "sha512-OBx3WF3pK0xNfFJeErmuD9R2QWLa3XdeZspyTsIrQmBDeKj3HKh8y7Scpx4NH5Y09+9JNqRRKRZN7OqWTYhITg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -385,9 +385,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
|
||||
"integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.3.tgz",
|
||||
"integrity": "sha512-n9IvR81NXZKnSN91mrgeXbEyCiGM+YLJpOgbdHoEtMP04VDnS+iSU4jGOtQBKErvWeCJQaGFQ9qzdcVchpRGyw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -4840,33 +4840,33 @@
|
||||
}
|
||||
},
|
||||
"@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
|
||||
"integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.3.tgz",
|
||||
"integrity": "sha512-47CvvSaV1EdUsFEpXUJApTk+hMzAhCxVizipCFUlXCgcmzpCDL86wNgJij/X9a+j6zADhIX//Lsu0qd/an/Bpw==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
|
||||
"integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.3.tgz",
|
||||
"integrity": "sha512-UlZZv8CmJIuRJNJG+Y1VmFsGyPR8W/72Q5EwgMMsSES6zpMQ9pNdBDWhL3UGX6nMRgnbprkwYiWJ3xHhJvtqtw==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
|
||||
"integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.3.tgz",
|
||||
"integrity": "sha512-L6NVJr/lKEd8+904FzZNpT8BGQMs2cHNYbGJMIaVvGnMiIJgKAFKtOyGtdDjoe1xRZoEw21yjRGksGbnRO5wHQ==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
|
||||
"integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.3.tgz",
|
||||
"integrity": "sha512-OBx3WF3pK0xNfFJeErmuD9R2QWLa3XdeZspyTsIrQmBDeKj3HKh8y7Scpx4NH5Y09+9JNqRRKRZN7OqWTYhITg==",
|
||||
"optional": true
|
||||
},
|
||||
"@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.4.2",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
|
||||
"integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.3.tgz",
|
||||
"integrity": "sha512-n9IvR81NXZKnSN91mrgeXbEyCiGM+YLJpOgbdHoEtMP04VDnS+iSU4jGOtQBKErvWeCJQaGFQ9qzdcVchpRGyw==",
|
||||
"optional": true
|
||||
},
|
||||
"@neon-rs/cli": {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.4.2",
|
||||
"version": "0.4.3",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -81,10 +81,10 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.2"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.4.3",
|
||||
"@lancedb/vectordb-darwin-x64": "0.4.3",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.4.3",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.4.3",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.4.3"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.4.4
|
||||
current_version = 0.5.0
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -56,6 +56,7 @@ class DBConnection(EnforceOverrides):
|
||||
data: Optional[DATA] = None,
|
||||
schema: Optional[Union[pa.Schema, LanceModel]] = None,
|
||||
mode: str = "create",
|
||||
exist_ok: bool = False,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
@@ -86,6 +87,11 @@ class DBConnection(EnforceOverrides):
|
||||
Can be either "create" or "overwrite".
|
||||
By default, if the table already exists, an exception is raised.
|
||||
If you want to overwrite the table, use mode="overwrite".
|
||||
exist_ok: bool, default False
|
||||
If a table by the same name already exists, then raise an exception
|
||||
if exist_ok=False. If exist_ok=True, then open the existing table;
|
||||
it will not add the provided data but will validate against any
|
||||
schema that's specified.
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
@@ -319,6 +325,7 @@ class LanceDBConnection(DBConnection):
|
||||
data: Optional[DATA] = None,
|
||||
schema: Optional[Union[pa.Schema, LanceModel]] = None,
|
||||
mode: str = "create",
|
||||
exist_ok: bool = False,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
@@ -338,6 +345,7 @@ class LanceDBConnection(DBConnection):
|
||||
data,
|
||||
schema,
|
||||
mode=mode,
|
||||
exist_ok=exist_ok,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
embedding_functions=embedding_functions,
|
||||
|
||||
@@ -19,4 +19,5 @@ from .open_clip import OpenClipEmbeddings
|
||||
from .openai import OpenAIEmbeddings
|
||||
from .registry import EmbeddingFunctionRegistry, get_registry
|
||||
from .sentence_transformers import SentenceTransformerEmbeddings
|
||||
from .gemini_text import GeminiText
|
||||
from .utils import with_embeddings
|
||||
|
||||
131
python/lancedb/embeddings/gemini_text.py
Normal file
131
python/lancedb/embeddings/gemini_text.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# Copyright (c) 2023. LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from functools import cached_property
|
||||
from typing import List, Union, Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .base import TextEmbeddingFunction
|
||||
from .registry import register
|
||||
from .utils import api_key_not_found_help, TEXT
|
||||
from lancedb.pydantic import PYDANTIC_VERSION
|
||||
|
||||
|
||||
@register("gemini-text")
|
||||
class GeminiText(TextEmbeddingFunction):
|
||||
"""
|
||||
An embedding function that uses Google's Gemini API. Requires GOOGLE_API_KEY to be set.
|
||||
|
||||
https://ai.google.dev/docs/embeddings_guide
|
||||
|
||||
Supports various task types:
|
||||
| Task Type | Description |
|
||||
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. |
|
||||
| "`retrieval_document`"  | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically provided by Embeddings API |
|
||||
| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). |
|
||||
| "`classification`" | Specifies that the embeddings will be used for classification. |
|
||||
| "`clustering`"          | Specifies that the embeddings will be used for clustering.                                                                                                  |
|
||||
|
||||
|
||||
Note: The supported task types might change in the Gemini API, but as long as a supported task type and its argument set is provided,
|
||||
those will be delegated to the API calls.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str, default "models/embedding-001"
|
||||
The name of the model to use. See the Gemini documentation for a list of available models.
|
||||
|
||||
query_task_type: str, default "retrieval_query"
|
||||
Sets the task type for the queries.
|
||||
source_task_type: str, default "retrieval_document"
|
||||
Sets the task type for ingestion.
|
||||
|
||||
Examples
|
||||
--------
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
model = get_registry().get("gemini-text").create()
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||
db = lancedb.connect("~/.lancedb")
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df)
|
||||
rs = tbl.search("hello").limit(1).to_pandas()
|
||||
|
||||
"""
|
||||
|
||||
name: str = "models/embedding-001"
|
||||
query_task_type: str = "retrieval_query"
|
||||
source_task_type: str = "retrieval_document"
|
||||
|
||||
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
|
||||
|
||||
class Config:
|
||||
keep_untouched = (cached_property,)
|
||||
|
||||
def ndims(self):
|
||||
# TODO: fix hardcoding
|
||||
return 768
|
||||
|
||||
def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]:
|
||||
return self.compute_source_embeddings(query, task_type=self.query_task_type)
|
||||
|
||||
def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]:
|
||||
texts = self.sanitize_input(texts)
|
||||
task_type = (
|
||||
kwargs.get("task_type") or self.source_task_type
|
||||
) # assume source task type if not passed by `compute_query_embeddings`
|
||||
return self.generate_embeddings(texts, task_type=task_type)
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray], *args, **kwargs
|
||||
) -> List[np.array]:
|
||||
"""
|
||||
Get the embeddings for the given texts
|
||||
|
||||
Parameters
|
||||
----------
|
||||
texts: list[str] or np.ndarray (of str)
|
||||
The texts to embed
|
||||
"""
|
||||
if (
|
||||
kwargs.get("task_type") == "retrieval_document"
|
||||
): # Provide a title to use existing API design
|
||||
title = "Embedding of a document"
|
||||
kwargs["title"] = title
|
||||
|
||||
return [
|
||||
self.client.embed_content(model=self.name, content=text, **kwargs)[
|
||||
"embedding"
|
||||
]
|
||||
for text in texts
|
||||
]
|
||||
|
||||
@cached_property
|
||||
def client(self):
|
||||
genai = self.safe_import("google.generativeai", "google.generativeai")
|
||||
|
||||
if not os.environ.get("GOOGLE_API_KEY"):
|
||||
api_key_not_found_help("google")
|
||||
return genai
|
||||
@@ -10,6 +10,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from functools import cached_property
|
||||
from typing import List, Union
|
||||
|
||||
@@ -17,6 +18,7 @@ import numpy as np
|
||||
|
||||
from .base import TextEmbeddingFunction
|
||||
from .registry import register
|
||||
from .utils import api_key_not_found_help
|
||||
|
||||
|
||||
@register("openai")
|
||||
@@ -51,4 +53,7 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
|
||||
@cached_property
|
||||
def _openai_client(self):
|
||||
openai = self.safe_import("openai")
|
||||
|
||||
if not os.environ.get("OPENAI_API_KEY"):
|
||||
api_key_not_found_help("openai")
|
||||
return openai.OpenAI()
|
||||
|
||||
@@ -216,7 +216,6 @@ def retry_with_exponential_backoff(
|
||||
exponential_base: float = 2,
|
||||
jitter: bool = True,
|
||||
max_retries: int = 7,
|
||||
# errors: tuple = (),
|
||||
):
|
||||
"""Retry a function with exponential backoff.
|
||||
|
||||
@@ -226,7 +225,6 @@ def retry_with_exponential_backoff(
|
||||
exponential_base (float): The base for exponential backoff (default is 2).
|
||||
jitter (bool): Whether to add jitter to the delay (default is True).
|
||||
max_retries (int): Maximum number of retries (default is 7).
|
||||
errors (tuple): Tuple of specific exceptions to retry on (default is (openai.error.RateLimitError,)).
|
||||
|
||||
Returns:
|
||||
function: The decorated function.
|
||||
|
||||
@@ -260,6 +260,17 @@ class LanceQueryBuilder(ABC):
|
||||
for row in self.to_arrow().to_pylist()
|
||||
]
|
||||
|
||||
def to_polars(self) -> "pl.DataFrame":
|
||||
"""
|
||||
Execute the query and return the results as a Polars DataFrame.
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
import polars as pl
|
||||
|
||||
return pl.from_arrow(self.to_arrow())
|
||||
|
||||
def limit(self, limit: Union[int, None]) -> LanceQueryBuilder:
|
||||
"""Set the maximum number of results to return.
|
||||
|
||||
|
||||
@@ -31,7 +31,13 @@ from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||
from .pydantic import LanceModel, model_to_dict
|
||||
from .query import LanceQueryBuilder, Query
|
||||
from .util import fs_from_uri, safe_import_pandas, value_to_sql, join_uri
|
||||
from .util import (
|
||||
fs_from_uri,
|
||||
safe_import_pandas,
|
||||
safe_import_polars,
|
||||
value_to_sql,
|
||||
join_uri,
|
||||
)
|
||||
from .utils.events import register_event
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -41,6 +47,7 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
pd = safe_import_pandas()
|
||||
pl = safe_import_polars()
|
||||
|
||||
|
||||
def _sanitize_data(
|
||||
@@ -66,6 +73,8 @@ def _sanitize_data(
|
||||
meta = data.schema.metadata if data.schema.metadata is not None else {}
|
||||
meta = {k: v for k, v in meta.items() if k != b"pandas"}
|
||||
data = data.replace_schema_metadata(meta)
|
||||
elif pl is not None and isinstance(data, pl.DataFrame):
|
||||
data = data.to_arrow()
|
||||
|
||||
if isinstance(data, pa.Table):
|
||||
if metadata:
|
||||
@@ -688,6 +697,30 @@ class LanceTable(Table):
|
||||
pa.Table"""
|
||||
return self._dataset.to_table()
|
||||
|
||||
def to_polars(self, batch_size=None) -> "pl.LazyFrame":
|
||||
"""Return the table as a polars LazyFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
batch_size: int, optional
|
||||
Passed to polars. This is the maximum row count for
|
||||
scanned pyarrow record batches
|
||||
|
||||
Note
|
||||
----
|
||||
1. This requires polars to be installed separately
|
||||
2. Currently we've disabled push-down of the filters from polars
|
||||
because polars pushdown into pyarrow uses pyarrow compute
|
||||
expressions rather than SQL strings (which LanceDB supports)
|
||||
|
||||
Returns
|
||||
-------
|
||||
pl.LazyFrame
|
||||
"""
|
||||
return pl.scan_pyarrow_dataset(
|
||||
self.to_lance(), allow_pyarrow_filter=False, batch_size=batch_size
|
||||
)
|
||||
|
||||
@property
|
||||
def _dataset_uri(self) -> str:
|
||||
return join_uri(self._conn.uri, f"{self.name}.lance")
|
||||
@@ -963,6 +996,7 @@ class LanceTable(Table):
|
||||
data=None,
|
||||
schema=None,
|
||||
mode="create",
|
||||
exist_ok=False,
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
embedding_functions: List[EmbeddingFunctionConfig] = None,
|
||||
@@ -1002,6 +1036,10 @@ class LanceTable(Table):
|
||||
mode: str, default "create"
|
||||
The mode to use when writing the data. Valid values are
|
||||
"create", "overwrite", and "append".
|
||||
exist_ok: bool, default False
|
||||
If the table already exists then raise an error if False,
|
||||
otherwise just open the table, it will not add the provided
|
||||
data but will validate against any schema that's specified.
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contains NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
@@ -1052,14 +1090,24 @@ class LanceTable(Table):
|
||||
schema = schema.with_metadata(metadata)
|
||||
|
||||
empty = pa.Table.from_pylist([], schema=schema)
|
||||
try:
|
||||
lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
|
||||
table = LanceTable(db, name)
|
||||
except OSError as err:
|
||||
if "Dataset already exists" in str(err) and exist_ok:
|
||||
if tbl.schema != schema:
|
||||
raise ValueError(
|
||||
f"Table {name} already exists with a different schema"
|
||||
)
|
||||
return tbl
|
||||
raise
|
||||
|
||||
new_table = LanceTable(db, name)
|
||||
|
||||
if data is not None:
|
||||
table.add(data)
|
||||
new_table.add(data)
|
||||
|
||||
register_event("create_table")
|
||||
return table
|
||||
return new_table
|
||||
|
||||
@classmethod
|
||||
def open(cls, db, name):
|
||||
@@ -1276,7 +1324,8 @@ def _sanitize_vector_column(
|
||||
"""
|
||||
# ChunkedArray is annoying to work with, so we combine chunks here
|
||||
vec_arr = data[vector_column_name].combine_chunks()
|
||||
if pa.types.is_list(data[vector_column_name].type):
|
||||
typ = data[vector_column_name].type
|
||||
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
|
||||
# if it's a variable size list array,
|
||||
# we make sure the dimensions are all the same
|
||||
has_jagged_ndims = len(vec_arr.values) % len(data) != 0
|
||||
|
||||
@@ -123,6 +123,15 @@ def safe_import_pandas():
|
||||
return None
|
||||
|
||||
|
||||
def safe_import_polars():
|
||||
try:
|
||||
import polars as pl
|
||||
|
||||
return pl
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
|
||||
@singledispatch
|
||||
def value_to_sql(value):
|
||||
raise NotImplementedError("SQL conversion is not implemented for this type")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.4.4"
|
||||
version = "0.5.0"
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.9.6",
|
||||
@@ -48,7 +48,7 @@ classifiers = [
|
||||
repository = "https://github.com/lancedb/lancedb"
|
||||
|
||||
[project.optional-dependencies]
|
||||
tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz"]
|
||||
tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz", "polars"]
|
||||
dev = ["ruff", "pre-commit"]
|
||||
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
|
||||
clip = ["torch", "pillow", "open-clip"]
|
||||
|
||||
@@ -190,6 +190,48 @@ def test_create_mode(tmp_path):
|
||||
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
|
||||
|
||||
|
||||
def test_create_exist_ok(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pd.DataFrame(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
)
|
||||
tbl = db.create_table("test", data=data)
|
||||
|
||||
with pytest.raises(OSError):
|
||||
db.create_table("test", data=data)
|
||||
|
||||
# open the table but don't add more rows
|
||||
tbl2 = db.create_table("test", data=data, exist_ok=True)
|
||||
assert tbl.name == tbl2.name
|
||||
assert tbl.schema == tbl2.schema
|
||||
assert len(tbl) == len(tbl2)
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), list_size=2)),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float64()),
|
||||
]
|
||||
)
|
||||
tbl3 = db.create_table("test", schema=schema, exist_ok=True)
|
||||
assert tbl3.schema == schema
|
||||
|
||||
bad_schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), list_size=2)),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float64()),
|
||||
pa.field("extra", pa.float32()),
|
||||
]
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
db.create_table("test", schema=bad_schema, exist_ok=True)
|
||||
|
||||
|
||||
def test_delete_table(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pd.DataFrame(
|
||||
|
||||
@@ -89,7 +89,7 @@ def test_openclip(tmp_path):
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
registry = get_registry()
|
||||
func = registry.get("open-clip").create()
|
||||
func = registry.get("open-clip").create(max_retries=0)
|
||||
|
||||
class Images(LanceModel):
|
||||
label: str
|
||||
@@ -170,7 +170,7 @@ def test_cohere_embedding_function():
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_instructor_embedding(tmp_path):
|
||||
model = get_registry().get("instructor").create()
|
||||
model = get_registry().get("instructor").create(max_retries=0)
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
@@ -182,3 +182,23 @@ def test_instructor_embedding(tmp_path):
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == model.ndims()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("GOOGLE_API_KEY") is None, reason="GOOGLE_API_KEY not set"
|
||||
)
|
||||
def test_gemini_embedding(tmp_path):
|
||||
model = get_registry().get("gemini-text").create(max_retries=0)
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df)
|
||||
assert len(tbl.to_pandas()["vector"][0]) == model.ndims()
|
||||
assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
|
||||
|
||||
@@ -20,6 +20,7 @@ from unittest.mock import PropertyMock, patch
|
||||
import lance
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
@@ -182,6 +183,46 @@ def test_add_pydantic_model(db):
|
||||
assert len(really_flattened.columns) == 7
|
||||
|
||||
|
||||
def test_polars(db):
|
||||
data = {
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
# Ingest polars dataframe
|
||||
table = LanceTable.create(db, "test", data=pl.DataFrame(data))
|
||||
assert len(table) == 2
|
||||
|
||||
result = table.to_pandas()
|
||||
assert np.allclose(result["vector"].tolist(), data["vector"])
|
||||
assert result["item"].tolist() == data["item"]
|
||||
assert np.allclose(result["price"].tolist(), data["price"])
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("item", pa.large_string()),
|
||||
pa.field("price", pa.float64()),
|
||||
]
|
||||
)
|
||||
assert table.schema == schema
|
||||
|
||||
# search results to polars dataframe
|
||||
q = [3.1, 4.1]
|
||||
result = table.search(q).limit(1).to_polars()
|
||||
assert np.allclose(result["vector"][0], q)
|
||||
assert result["item"][0] == "foo"
|
||||
assert np.allclose(result["price"][0], 10.0)
|
||||
|
||||
# entire table to polars dataframe
|
||||
result = table.to_polars()
|
||||
assert np.allclose(result.collect()["vector"].to_list(), data["vector"])
|
||||
|
||||
# make sure filtering isn't broken
|
||||
filtered_result = result.filter(pl.col("item").is_in(["foo", "bar"])).collect()
|
||||
assert len(filtered_result) == 2
|
||||
|
||||
|
||||
def _add(table, schema):
|
||||
# table = LanceTable(db, "test")
|
||||
assert len(table) == 2
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb-node"
|
||||
version = "0.4.2"
|
||||
version = "0.4.3"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
edition = "2018"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "vectordb"
|
||||
version = "0.4.2"
|
||||
version = "0.4.3"
|
||||
edition = "2021"
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license = "Apache-2.0"
|
||||
|
||||
Reference in New Issue
Block a user