mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-12 06:42:56 +00:00
fix: use pandas with pydantic embedding column (#1818)
* Make Pandas `DataFrame` works with embedding function + Subset of columns * Make `lancedb.create_table()` work with embedding function
This commit is contained in:
@@ -1,15 +1,6 @@
|
||||
# Copyright 2023 LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
from typing import List, Union
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
@@ -18,6 +9,7 @@ import lancedb
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from lancedb.conftest import MockTextEmbeddingFunction
|
||||
from lancedb.embeddings import (
|
||||
EmbeddingFunctionConfig,
|
||||
@@ -129,6 +121,142 @@ def test_embedding_with_bad_results(tmp_path):
|
||||
# assert tbl["vector"].null_count == 1
|
||||
|
||||
|
||||
def test_with_existing_vectors(tmp_path):
|
||||
@register("mock-embedding")
|
||||
class MockEmbeddingFunction(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 128
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> List[np.array]:
|
||||
return [np.random.randn(self.ndims()).tolist() for _ in range(len(texts))]
|
||||
|
||||
registry = get_registry()
|
||||
model = registry.get("mock-embedding").create()
|
||||
|
||||
class Schema(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("test", schema=Schema, mode="overwrite")
|
||||
tbl.add([{"text": "hello world", "vector": np.zeros(128).tolist()}])
|
||||
|
||||
embeddings = tbl.to_arrow()["vector"].to_pylist()
|
||||
assert not np.any(embeddings), "all zeros"
|
||||
|
||||
|
||||
def test_embedding_function_with_pandas(tmp_path):
|
||||
@register("mock-embedding")
|
||||
class _MockEmbeddingFunction(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 128
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> List[np.array]:
|
||||
return [np.random.randn(self.ndims()).tolist() for _ in range(len(texts))]
|
||||
|
||||
registery = get_registry()
|
||||
func = registery.get("mock-embedding").create()
|
||||
|
||||
class TestSchema(LanceModel):
|
||||
text: str = func.SourceField()
|
||||
val: int
|
||||
vector: Vector(func.ndims()) = func.VectorField()
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"text": ["hello world", "goodbye world"],
|
||||
"val": [1, 2],
|
||||
"not-used": ["s1", "s3"],
|
||||
}
|
||||
)
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("test", schema=TestSchema, mode="overwrite", data=df)
|
||||
schema = tbl.schema
|
||||
assert schema.field("text").type == pa.string()
|
||||
assert schema.field("val").type == pa.int64()
|
||||
assert schema.field("vector").type == pa.list_(pa.float32(), 128)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"text": ["extra", "more"],
|
||||
"val": [4, 5],
|
||||
"misc-col": ["s1", "s3"],
|
||||
}
|
||||
)
|
||||
tbl.add(df)
|
||||
|
||||
assert tbl.count_rows() == 4
|
||||
embeddings = tbl.to_arrow()["vector"]
|
||||
assert embeddings.null_count == 0
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"text": ["with", "embeddings"],
|
||||
"val": [6, 7],
|
||||
"vector": [np.zeros(128).tolist(), np.zeros(128).tolist()],
|
||||
}
|
||||
)
|
||||
tbl.add(df)
|
||||
|
||||
embeddings = tbl.search().where("val > 5").to_arrow()["vector"].to_pylist()
|
||||
assert not np.any(embeddings), "all zeros"
|
||||
|
||||
|
||||
def test_multiple_embeddings_for_pandas(tmp_path):
|
||||
@register("mock-embedding")
|
||||
class MockFunc1(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 128
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> List[np.array]:
|
||||
return [np.random.randn(self.ndims()).tolist() for _ in range(len(texts))]
|
||||
|
||||
@register("mock-embedding2")
|
||||
class MockFunc2(TextEmbeddingFunction):
|
||||
def ndims(self):
|
||||
return 512
|
||||
|
||||
def generate_embeddings(
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> List[np.array]:
|
||||
return [np.random.randn(self.ndims()).tolist() for _ in range(len(texts))]
|
||||
|
||||
registery = get_registry()
|
||||
func1 = registery.get("mock-embedding").create()
|
||||
func2 = registery.get("mock-embedding2").create()
|
||||
|
||||
class TestSchema(LanceModel):
|
||||
text: str = func1.SourceField()
|
||||
val: int
|
||||
vec1: Vector(func1.ndims()) = func1.VectorField()
|
||||
prompt: str = func2.SourceField()
|
||||
vec2: Vector(func2.ndims()) = func2.VectorField()
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"text": ["hello world", "goodbye world"],
|
||||
"val": [1, 2],
|
||||
"prompt": ["hello", "goodbye"],
|
||||
}
|
||||
)
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("test", schema=TestSchema, mode="overwrite", data=df)
|
||||
|
||||
schema = tbl.schema
|
||||
assert schema.field("text").type == pa.string()
|
||||
assert schema.field("val").type == pa.int64()
|
||||
assert schema.field("vec1").type == pa.list_(pa.float32(), 128)
|
||||
assert schema.field("prompt").type == pa.string()
|
||||
assert schema.field("vec2").type == pa.list_(pa.float32(), 512)
|
||||
assert tbl.count_rows() == 2
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_embedding_function_rate_limit(tmp_path):
|
||||
def _get_schema_from_model(model):
|
||||
|
||||
Reference in New Issue
Block a user