[python] Use pydantic for embedding function persistence (#467)

1. Support persistent embedding function so users can just search using
query string
2. Add fixed size list conversion for multiple vector columns
3. Add support for empty query (just apply select/where/limit).
4. Refactor and simplify some of the data prep code

---------

Co-authored-by: Chang She <chang@lancedb.com>
Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
Chang She
2023-09-05 21:30:45 -07:00
committed by GitHub
parent 52fa7f5577
commit 9a9a73a65d
13 changed files with 815 additions and 192 deletions

View File

@@ -1,7 +1,10 @@
import os
import pyarrow as pa
import pytest
from lancedb.embeddings import EmbeddingFunctionModel, EmbeddingFunctionRegistry
# import lancedb so we don't have to in every example
@@ -14,3 +17,22 @@ def doctest_setup(monkeypatch, tmpdir):
monkeypatch.setitem(os.environ, "COLUMNS", "80")
# Work in a temporary directory
monkeypatch.chdir(tmpdir)
registry = EmbeddingFunctionRegistry.get_instance()
@registry.register()
class MockEmbeddingFunction(EmbeddingFunctionModel):
def __call__(self, data):
if isinstance(data, str):
data = [data]
elif isinstance(data, pa.ChunkedArray):
data = data.combine_chunks().to_pylist()
elif isinstance(data, pa.Array):
data = data.to_pylist()
return [self.embed(row) for row in data]
def embed(self, row):
return [float(hash(c)) for c in row[:10]]