Compare commits

...

10 Commits

Author SHA1 Message Date
Lance Release
8bcdc81fd3 [python] Bump version: 0.4.4 → 0.5.0 2024-01-18 01:53:15 +00:00
Chang She
39e14c70c5 chore(python): turn off lazy frame ingestion (#821) 2024-01-16 19:11:16 -08:00
Chang She
af8263af94 feat(python): allow the entire table to be converted to a polars dataframe (#814) 2024-01-15 15:49:16 -08:00
Chang She
be4ab9eef3 feat(python): add exist_ok option to create table (#813)
This mimics CREATE TABLE IF NOT EXISTS behavior.
We add an `exist_ok` parameter to `db.create_table(...)`.
By default it is set to False, so trying to create
a table with the same name raises an exception.
If set to True and the table already exists, the table
is simply opened instead. If you pass in a schema, it is
checked against the existing table to make sure you get
what you expect. If you pass in data, it is NOT added
to the existing table.
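A minimal sketch of the resulting call pattern (the connection path, table name, and data below are illustrative, not taken from the PR):
```python
import lancedb

db = lancedb.connect("~/.lancedb")
data = [{"vector": [3.1, 4.1], "item": "foo"}]

tbl = db.create_table("my_table", data=data)  # creates the table

# A second create_table with the same name raises by default.
# With exist_ok=True it just opens the existing table; `data` is NOT appended.
same_tbl = db.create_table("my_table", data=data, exist_ok=True)
assert same_tbl.name == tbl.name
```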
2024-01-15 11:09:18 -08:00
Ayush Chaurasia
184d2bc969 chore(python): get rid of Pydantic deprecation warning in embedding fcn (#816)
```
UserWarning: Valid config keys have changed in V2:
* 'keep_untouched' has been renamed to 'ignored_types'
  warnings.warn(message, UserWarning)
```
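For context, a rough sketch of the Pydantic 1.x spelling that triggers this warning under Pydantic 2.x, and its 2.x equivalent (class names are illustrative):
```python
from functools import cached_property

from pydantic import BaseModel, ConfigDict


class OldStyle(BaseModel):
    # Pydantic 1.x spelling; under 2.x this emits the UserWarning above
    class Config:
        keep_untouched = (cached_property,)


class NewStyle(BaseModel):
    # Pydantic 2.x spelling of the same setting
    model_config = ConfigDict(ignored_types=(cached_property,))
```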
2024-01-15 12:19:51 +05:30
Anton Shevtsov
ff6f005336 Add openai api key not found help (#815)
This pull request adds a check for the presence of the environment variable
`OPENAI_API_KEY` and removes an unused parameter from the
`retry_with_exponential_backoff` function.
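Roughly, the guard this adds (a sketch based on the diff further down; the helper lives in `lancedb.embeddings.utils`):
```python
import os

from lancedb.embeddings.utils import api_key_not_found_help

# Fail early with a helpful message instead of an opaque OpenAI client error.
if not os.environ.get("OPENAI_API_KEY"):
    api_key_not_found_help("openai")
```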
2024-01-15 02:44:09 +05:30
Chang She
49333e522c feat(python): basic polars integration (#811)
We should now be able to directly ingest polars dataframes and return
results as polars dataframes.


![image](https://github.com/lancedb/lancedb/assets/759245/828b1260-c791-45f1-a047-aa649575e798)
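A short sketch of the round trip this enables, based on the changes below (the connection path, table name, and data are illustrative):
```python
import lancedb
import polars as pl

db = lancedb.connect("~/.lancedb")
df = pl.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"]})

tbl = db.create_table("polars_demo", data=df)       # ingest a polars DataFrame directly

hits = tbl.search([3.1, 4.1]).limit(1).to_polars()  # query results as a polars DataFrame
lazy = tbl.to_polars()                              # whole table as a polars LazyFrame
print(hits["item"][0], lazy.collect().shape)
```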
2024-01-13 16:38:16 -08:00
Ayush Chaurasia
4568df422d feat(python): Add gemini text embedding function (#806)
Named it Gemini-text for now. Not sure how complicated it will be to
support both text and multimodal embeddings under the same class
"gemini". But it's not something to worry about for now, I guess.
2024-01-12 22:38:55 -08:00
Lance Release
986891db98 Updating package-lock.json 2024-01-11 22:21:42 +00:00
Lance Release
036bf02901 Updating package-lock.json 2024-01-11 21:34:04 +00:00
17 changed files with 414 additions and 53 deletions

View File

@@ -67,7 +67,7 @@ We'll cover the basics of using LanceDB on your local machine in this section.
     !!! warning
         If the table already exists, LanceDB will raise an error by default.
-        If you want to overwrite the table, you can pass in `mode="overwrite"`
+        If you want to make sure you overwrite the table, pass in `mode="overwrite"`
         to the `createTable` function.
 === "Javascript"

View File

@@ -118,6 +118,42 @@ texts = [{"text": "Capitalism has been dominant in the Western world since the e
 tbl.add(texts)
 ```
+
+## Gemini Embedding Function
+
+With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide.
+
+The Gemini Embedding Model API supports various task types:
+
+| Task Type               | Description                                                                                                                                          |
+|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|
+| "`retrieval_query`"     | Specifies the given text is a query in a search/retrieval setting.                                                                                    |
+| "`retrieval_document`"  | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title, but it is automatically provided by the Embeddings API. |
+| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS).                                                                           |
+| "`classification`"      | Specifies that the embeddings will be used for classification.                                                                                         |
+| "`clustering`"          | Specifies that the embeddings will be used for clustering.                                                                                             |
+
+Usage Example:
+
+```python
+import lancedb
+import pandas as pd
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+
+model = get_registry().get("gemini-text").create()
+
+class TextModel(LanceModel):
+    text: str = model.SourceField()
+    vector: Vector(model.ndims()) = model.VectorField()
+
+df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+db = lancedb.connect("~/.lancedb")
+tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+tbl.add(df)
+rs = tbl.search("hello").limit(1).to_pandas()
+```

 ## Multi-modal embedding functions
 Multi-modal embedding functions allow you to query your table using both images and text.

View File

@@ -31,13 +31,23 @@ This guide will show how to create tables, insert data into them, and update the
         ```

     !!! info "Note"
-        If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you can pass in mode="overwrite" to the createTable function.
+        If the table already exists, LanceDB will raise an error by default.
+
+        `create_table` supports an optional `exist_ok` parameter. When set to True
+        and the table exists, then it simply opens the existing table. The data you
+        passed in will NOT be appended to the table in that case.
+
+        ```python
+        db.create_table("name", data, exist_ok=True)
+        ```
+
+        Sometimes you want to make sure that you start fresh. If you want to
+        overwrite the table, you can pass in mode="overwrite" to the createTable function.
+
         ```python
         db.create_table("name", data, mode="overwrite")
         ```

     ### From pandas DataFrame

     ```python

node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.4.2",
+  "version": "0.4.3",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.4.2",
+      "version": "0.4.3",
       "cpu": [
         "x64",
         "arm64"
@@ -53,11 +53,11 @@
"uuid": "^9.0.0" "uuid": "^9.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.2", "@lancedb/vectordb-darwin-arm64": "0.4.3",
"@lancedb/vectordb-darwin-x64": "0.4.2", "@lancedb/vectordb-darwin-x64": "0.4.3",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.2", "@lancedb/vectordb-linux-arm64-gnu": "0.4.3",
"@lancedb/vectordb-linux-x64-gnu": "0.4.2", "@lancedb/vectordb-linux-x64-gnu": "0.4.3",
"@lancedb/vectordb-win32-x64-msvc": "0.4.2" "@lancedb/vectordb-win32-x64-msvc": "0.4.3"
} }
}, },
"node_modules/@75lb/deep-merge": { "node_modules/@75lb/deep-merge": {
@@ -337,9 +337,9 @@
       }
     },
     "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
-      "integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.3.tgz",
+      "integrity": "sha512-47CvvSaV1EdUsFEpXUJApTk+hMzAhCxVizipCFUlXCgcmzpCDL86wNgJij/X9a+j6zADhIX//Lsu0qd/an/Bpw==",
       "cpu": [
         "arm64"
       ],
@@ -349,9 +349,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
-      "integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.3.tgz",
+      "integrity": "sha512-UlZZv8CmJIuRJNJG+Y1VmFsGyPR8W/72Q5EwgMMsSES6zpMQ9pNdBDWhL3UGX6nMRgnbprkwYiWJ3xHhJvtqtw==",
       "cpu": [
         "x64"
       ],
@@ -361,9 +361,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
-      "integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.3.tgz",
+      "integrity": "sha512-L6NVJr/lKEd8+904FzZNpT8BGQMs2cHNYbGJMIaVvGnMiIJgKAFKtOyGtdDjoe1xRZoEw21yjRGksGbnRO5wHQ==",
       "cpu": [
         "arm64"
       ],
@@ -373,9 +373,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
-      "integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.3.tgz",
+      "integrity": "sha512-OBx3WF3pK0xNfFJeErmuD9R2QWLa3XdeZspyTsIrQmBDeKj3HKh8y7Scpx4NH5Y09+9JNqRRKRZN7OqWTYhITg==",
       "cpu": [
         "x64"
       ],
@@ -385,9 +385,9 @@
       ]
     },
     "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
-      "integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.3.tgz",
+      "integrity": "sha512-n9IvR81NXZKnSN91mrgeXbEyCiGM+YLJpOgbdHoEtMP04VDnS+iSU4jGOtQBKErvWeCJQaGFQ9qzdcVchpRGyw==",
       "cpu": [
         "x64"
       ],
@@ -4840,33 +4840,33 @@
      }
    },
    "@lancedb/vectordb-darwin-arm64": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
-      "integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.3.tgz",
+      "integrity": "sha512-47CvvSaV1EdUsFEpXUJApTk+hMzAhCxVizipCFUlXCgcmzpCDL86wNgJij/X9a+j6zADhIX//Lsu0qd/an/Bpw==",
      "optional": true
    },
    "@lancedb/vectordb-darwin-x64": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
-      "integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.3.tgz",
+      "integrity": "sha512-UlZZv8CmJIuRJNJG+Y1VmFsGyPR8W/72Q5EwgMMsSES6zpMQ9pNdBDWhL3UGX6nMRgnbprkwYiWJ3xHhJvtqtw==",
      "optional": true
    },
    "@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
-      "integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.3.tgz",
+      "integrity": "sha512-L6NVJr/lKEd8+904FzZNpT8BGQMs2cHNYbGJMIaVvGnMiIJgKAFKtOyGtdDjoe1xRZoEw21yjRGksGbnRO5wHQ==",
      "optional": true
    },
    "@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
-      "integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.3.tgz",
+      "integrity": "sha512-OBx3WF3pK0xNfFJeErmuD9R2QWLa3XdeZspyTsIrQmBDeKj3HKh8y7Scpx4NH5Y09+9JNqRRKRZN7OqWTYhITg==",
      "optional": true
    },
    "@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.4.2",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
-      "integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.3.tgz",
+      "integrity": "sha512-n9IvR81NXZKnSN91mrgeXbEyCiGM+YLJpOgbdHoEtMP04VDnS+iSU4jGOtQBKErvWeCJQaGFQ9qzdcVchpRGyw==",
      "optional": true
    },
    "@neon-rs/cli": {

View File

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.4
+current_version = 0.5.0
 commit = True
 message = [python] Bump version: {current_version} → {new_version}
 tag = True

View File

@@ -56,6 +56,7 @@ class DBConnection(EnforceOverrides):
         data: Optional[DATA] = None,
         schema: Optional[Union[pa.Schema, LanceModel]] = None,
         mode: str = "create",
+        exist_ok: bool = False,
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
         embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
@@ -86,6 +87,11 @@ class DBConnection(EnforceOverrides):
             Can be either "create" or "overwrite".
             By default, if the table already exists, an exception is raised.
             If you want to overwrite the table, use mode="overwrite".
+        exist_ok: bool, default False
+            If a table by the same name already exists, then raise an exception
+            if exist_ok=False. If exist_ok=True, then open the existing table;
+            it will not add the provided data but will validate against any
+            schema that's specified.
         on_bad_vectors: str, default "error"
             What to do if any of the vectors are not the same size or contains NaNs.
             One of "error", "drop", "fill".
@@ -319,6 +325,7 @@ class LanceDBConnection(DBConnection):
         data: Optional[DATA] = None,
         schema: Optional[Union[pa.Schema, LanceModel]] = None,
         mode: str = "create",
+        exist_ok: bool = False,
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
         embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
@@ -338,6 +345,7 @@ class LanceDBConnection(DBConnection):
             data,
             schema,
             mode=mode,
+            exist_ok=exist_ok,
             on_bad_vectors=on_bad_vectors,
             fill_value=fill_value,
             embedding_functions=embedding_functions,

View File

@@ -19,4 +19,5 @@ from .open_clip import OpenClipEmbeddings
 from .openai import OpenAIEmbeddings
 from .registry import EmbeddingFunctionRegistry, get_registry
 from .sentence_transformers import SentenceTransformerEmbeddings
+from .gemini_text import GeminiText
 from .utils import with_embeddings

View File

@@ -0,0 +1,131 @@
# Copyright (c) 2023. LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import cached_property
from typing import List, Union, Any
import numpy as np
from .base import TextEmbeddingFunction
from .registry import register
from .utils import api_key_not_found_help, TEXT
from lancedb.pydantic import PYDANTIC_VERSION
@register("gemini-text")
class GeminiText(TextEmbeddingFunction):
"""
An embedding function that uses the Google's Gemini API. Requires GOOGLE_API_KEY to be set.
https://ai.google.dev/docs/embeddings_guide
Supports various tasks types:
| Task Type | Description |
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. |
| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically proided by Embeddings API |
| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). |
| "`classification`" | Specifies that the embeddings will be used for classification. |
| "`clusering`" | Specifies that the embeddings will be used for clustering. |
Note: The supported task types might change in the Gemini API, but as long as a supported task type and its argument set is provided,
those will be delegated to the API calls.
Parameters
----------
name: str, default "models/embedding-001"
The name of the model to use. See the Gemini documentation for a list of available models.
query_task_type: str, default "retrieval_query"
Sets the task type for the queries.
source_task_type: str, default "retrieval_document"
Sets the task type for ingestion.
Examples
--------
import lancedb
import pandas as pd
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
model = get_registry().get("gemini-text").create()
class TextModel(LanceModel):
text: str = model.SourceField()
vector: Vector(model.ndims()) = model.VectorField()
df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
db = lancedb.connect("~/.lancedb")
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
tbl.add(df)
rs = tbl.search("hello").limit(1).to_pandas()
"""
name: str = "models/embedding-001"
query_task_type: str = "retrieval_query"
source_task_type: str = "retrieval_document"
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
class Config:
keep_untouched = (cached_property,)
def ndims(self):
# TODO: fix hardcoding
return 768
def compute_query_embeddings(self, query: str, *args, **kwargs) -> List[np.array]:
return self.compute_source_embeddings(query, task_type=self.query_task_type)
def compute_source_embeddings(self, texts: TEXT, *args, **kwargs) -> List[np.array]:
texts = self.sanitize_input(texts)
task_type = (
kwargs.get("task_type") or self.source_task_type
) # assume source task type if not passed by `compute_query_embeddings`
return self.generate_embeddings(texts, task_type=task_type)
def generate_embeddings(
self, texts: Union[List[str], np.ndarray], *args, **kwargs
) -> List[np.array]:
"""
Get the embeddings for the given texts
Parameters
----------
texts: list[str] or np.ndarray (of str)
The texts to embed
"""
if (
kwargs.get("task_type") == "retrieval_document"
): # Provide a title to use existing API design
title = "Embedding of a document"
kwargs["title"] = title
return [
self.client.embed_content(model=self.name, content=text, **kwargs)[
"embedding"
]
for text in texts
]
@cached_property
def client(self):
genai = self.safe_import("google.generativeai", "google.generativeai")
if not os.environ.get("GOOGLE_API_KEY"):
api_key_not_found_help("google")
return genai

View File

@@ -10,6 +10,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from functools import cached_property
 from typing import List, Union
@@ -17,6 +18,7 @@ import numpy as np
 from .base import TextEmbeddingFunction
 from .registry import register
+from .utils import api_key_not_found_help

 @register("openai")
@@ -51,4 +53,7 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
     @cached_property
     def _openai_client(self):
         openai = self.safe_import("openai")
+        if not os.environ.get("OPENAI_API_KEY"):
+            api_key_not_found_help("openai")
         return openai.OpenAI()

View File

@@ -216,7 +216,6 @@ def retry_with_exponential_backoff(
     exponential_base: float = 2,
     jitter: bool = True,
     max_retries: int = 7,
-    # errors: tuple = (),
 ):
     """Retry a function with exponential backoff.
@@ -226,7 +225,6 @@ def retry_with_exponential_backoff(
         exponential_base (float): The base for exponential backoff (default is 2).
         jitter (bool): Whether to add jitter to the delay (default is True).
         max_retries (int): Maximum number of retries (default is 10).
-        errors (tuple): Tuple of specific exceptions to retry on (default is (openai.error.RateLimitError,)).
     Returns:
         function: The decorated function.

View File

@@ -260,6 +260,17 @@ class LanceQueryBuilder(ABC):
             for row in self.to_arrow().to_pylist()
         ]

+    def to_polars(self) -> "pl.DataFrame":
+        """
+        Execute the query and return the results as a Polars DataFrame.
+        In addition to the selected columns, LanceDB also returns a vector
+        and also the "_distance" column which is the distance between the query
+        vector and the returned vector.
+        """
+        import polars as pl
+
+        return pl.from_arrow(self.to_arrow())
+
     def limit(self, limit: Union[int, None]) -> LanceQueryBuilder:
         """Set the maximum number of results to return.

View File

@@ -31,7 +31,13 @@ from .common import DATA, VEC, VECTOR_COLUMN_NAME
 from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
 from .pydantic import LanceModel, model_to_dict
 from .query import LanceQueryBuilder, Query
-from .util import fs_from_uri, safe_import_pandas, value_to_sql, join_uri
+from .util import (
+    fs_from_uri,
+    safe_import_pandas,
+    safe_import_polars,
+    value_to_sql,
+    join_uri,
+)
 from .utils.events import register_event

 if TYPE_CHECKING:
@@ -41,6 +47,7 @@ if TYPE_CHECKING:
 pd = safe_import_pandas()
+pl = safe_import_polars()

 def _sanitize_data(
@@ -66,6 +73,8 @@ def _sanitize_data(
         meta = data.schema.metadata if data.schema.metadata is not None else {}
         meta = {k: v for k, v in meta.items() if k != b"pandas"}
         data = data.replace_schema_metadata(meta)
+    elif pl is not None and isinstance(data, pl.DataFrame):
+        data = data.to_arrow()

     if isinstance(data, pa.Table):
         if metadata:
@@ -688,6 +697,30 @@ class LanceTable(Table):
pa.Table""" pa.Table"""
return self._dataset.to_table() return self._dataset.to_table()
def to_polars(self, batch_size=None) -> "pl.LazyFrame":
"""Return the table as a polars LazyFrame.
Parameters
----------
batch_size: int, optional
Passed to polars. This is the maximum row count for
scanned pyarrow record batches
Note
----
1. This requires polars to be installed separately
2. Currently we've disabled push-down of the filters from polars
because polars pushdown into pyarrow uses pyarrow compute
expressions rather than SQl strings (which LanceDB supports)
Returns
-------
pl.LazyFrame
"""
return pl.scan_pyarrow_dataset(
self.to_lance(), allow_pyarrow_filter=False, batch_size=batch_size
)
@property @property
def _dataset_uri(self) -> str: def _dataset_uri(self) -> str:
return join_uri(self._conn.uri, f"{self.name}.lance") return join_uri(self._conn.uri, f"{self.name}.lance")
@@ -963,6 +996,7 @@ class LanceTable(Table):
         data=None,
         schema=None,
         mode="create",
+        exist_ok=False,
         on_bad_vectors: str = "error",
         fill_value: float = 0.0,
         embedding_functions: List[EmbeddingFunctionConfig] = None,
@@ -1002,6 +1036,10 @@ class LanceTable(Table):
         mode: str, default "create"
             The mode to use when writing the data. Valid values are
             "create", "overwrite", and "append".
+        exist_ok: bool, default False
+            If the table already exists then raise an error if False,
+            otherwise just open the table, it will not add the provided
+            data but will validate against any schema that's specified.
         on_bad_vectors: str, default "error"
             What to do if any of the vectors are not the same size or contains NaNs.
             One of "error", "drop", "fill".
@@ -1052,14 +1090,24 @@ class LanceTable(Table):
             schema = schema.with_metadata(metadata)
         empty = pa.Table.from_pylist([], schema=schema)

-        lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
-        table = LanceTable(db, name)
+        try:
+            lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
+        except OSError as err:
+            if "Dataset already exists" in str(err) and exist_ok:
+                if tbl.schema != schema:
+                    raise ValueError(
+                        f"Table {name} already exists with a different schema"
+                    )
+                return tbl
+            raise
+
+        new_table = LanceTable(db, name)
         if data is not None:
-            table.add(data)
+            new_table.add(data)
         register_event("create_table")
-        return table
+        return new_table

     @classmethod
     def open(cls, db, name):
@@ -1276,7 +1324,8 @@ def _sanitize_vector_column(
""" """
# ChunkedArray is annoying to work with, so we combine chunks here # ChunkedArray is annoying to work with, so we combine chunks here
vec_arr = data[vector_column_name].combine_chunks() vec_arr = data[vector_column_name].combine_chunks()
if pa.types.is_list(data[vector_column_name].type): typ = data[vector_column_name].type
if pa.types.is_list(typ) or pa.types.is_large_list(typ):
# if it's a variable size list array, # if it's a variable size list array,
# we make sure the dimensions are all the same # we make sure the dimensions are all the same
has_jagged_ndims = len(vec_arr.values) % len(data) != 0 has_jagged_ndims = len(vec_arr.values) % len(data) != 0

View File

@@ -123,6 +123,15 @@ def safe_import_pandas():
         return None


+def safe_import_polars():
+    try:
+        import polars as pl
+
+        return pl
+    except ImportError:
+        return None
+
+
 @singledispatch
 def value_to_sql(value):
     raise NotImplementedError("SQL conversion is not implemented for this type")

View File

@@ -1,6 +1,6 @@
 [project]
 name = "lancedb"
-version = "0.4.4"
+version = "0.5.0"
 dependencies = [
     "deprecation",
     "pylance==0.9.6",
@@ -48,7 +48,7 @@ classifiers = [
 repository = "https://github.com/lancedb/lancedb"

 [project.optional-dependencies]
-tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz"]
+tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz", "polars"]
 dev = ["ruff", "pre-commit"]
 docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
 clip = ["torch", "pillow", "open-clip"]

View File

@@ -190,6 +190,48 @@ def test_create_mode(tmp_path):
     assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]


+def test_create_exist_ok(tmp_path):
+    db = lancedb.connect(tmp_path)
+    data = pd.DataFrame(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5]],
+            "item": ["foo", "bar"],
+            "price": [10.0, 20.0],
+        }
+    )
+    tbl = db.create_table("test", data=data)
+    with pytest.raises(OSError):
+        db.create_table("test", data=data)
+    # open the table but don't add more rows
+    tbl2 = db.create_table("test", data=data, exist_ok=True)
+    assert tbl.name == tbl2.name
+    assert tbl.schema == tbl2.schema
+    assert len(tbl) == len(tbl2)
+
+    schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), list_size=2)),
+            pa.field("item", pa.utf8()),
+            pa.field("price", pa.float64()),
+        ]
+    )
+    tbl3 = db.create_table("test", schema=schema, exist_ok=True)
+    assert tbl3.schema == schema
+
+    bad_schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), list_size=2)),
+            pa.field("item", pa.utf8()),
+            pa.field("price", pa.float64()),
+            pa.field("extra", pa.float32()),
+        ]
+    )
+    with pytest.raises(ValueError):
+        db.create_table("test", schema=bad_schema, exist_ok=True)
+
+
 def test_delete_table(tmp_path):
     db = lancedb.connect(tmp_path)
     data = pd.DataFrame(

View File

@@ -89,7 +89,7 @@ def test_openclip(tmp_path):
     db = lancedb.connect(tmp_path)
     registry = get_registry()
-    func = registry.get("open-clip").create()
+    func = registry.get("open-clip").create(max_retries=0)

     class Images(LanceModel):
         label: str
@@ -170,7 +170,7 @@ def test_cohere_embedding_function():
 @pytest.mark.slow
 def test_instructor_embedding(tmp_path):
-    model = get_registry().get("instructor").create()
+    model = get_registry().get("instructor").create(max_retries=0)

     class TextModel(LanceModel):
         text: str = model.SourceField()
@@ -182,3 +182,23 @@ def test_instructor_embedding(tmp_path):
     tbl.add(df)
     assert len(tbl.to_pandas()["vector"][0]) == model.ndims()
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("GOOGLE_API_KEY") is None, reason="GOOGLE_API_KEY not set"
+)
+def test_gemini_embedding(tmp_path):
+    model = get_registry().get("gemini-text").create(max_retries=0)
+
+    class TextModel(LanceModel):
+        text: str = model.SourceField()
+        vector: Vector(model.ndims()) = model.VectorField()
+
+    df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+    db = lancedb.connect(tmp_path)
+    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+    tbl.add(df)
+    assert len(tbl.to_pandas()["vector"][0]) == model.ndims()
+    assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"

View File

@@ -20,6 +20,7 @@ from unittest.mock import PropertyMock, patch
 import lance
 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pytest
 from pydantic import BaseModel
@@ -182,6 +183,46 @@ def test_add_pydantic_model(db):
     assert len(really_flattened.columns) == 7


+def test_polars(db):
+    data = {
+        "vector": [[3.1, 4.1], [5.9, 26.5]],
+        "item": ["foo", "bar"],
+        "price": [10.0, 20.0],
+    }
+    # Ingest polars dataframe
+    table = LanceTable.create(db, "test", data=pl.DataFrame(data))
+    assert len(table) == 2
+
+    result = table.to_pandas()
+    assert np.allclose(result["vector"].tolist(), data["vector"])
+    assert result["item"].tolist() == data["item"]
+    assert np.allclose(result["price"].tolist(), data["price"])
+
+    schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), 2)),
+            pa.field("item", pa.large_string()),
+            pa.field("price", pa.float64()),
+        ]
+    )
+    assert table.schema == schema
+
+    # search results to polars dataframe
+    q = [3.1, 4.1]
+    result = table.search(q).limit(1).to_polars()
+    assert np.allclose(result["vector"][0], q)
+    assert result["item"][0] == "foo"
+    assert np.allclose(result["price"][0], 10.0)
+
+    # entire table to polars dataframe
+    result = table.to_polars()
+    assert np.allclose(result.collect()["vector"].to_list(), data["vector"])
+
+    # make sure filtering isn't broken
+    filtered_result = result.filter(pl.col("item").is_in(["foo", "bar"])).collect()
+    assert len(filtered_result) == 2
+
+
 def _add(table, schema):
     # table = LanceTable(db, "test")
     assert len(table) == 2