diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 223f20d1..7d74cc60 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -60,6 +60,9 @@ nav: - Python integrations: - Pandas and PyArrow: python/arrow.md - DuckDB: python/duckdb.md + - LangChain 🦜️🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html + - LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html + - Pydantic: python/pydantic.md - Python examples: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb diff --git a/docs/src/python/pydantic.md b/docs/src/python/pydantic.md new file mode 100644 index 00000000..92beaf76 --- /dev/null +++ b/docs/src/python/pydantic.md @@ -0,0 +1,35 @@ +# Pydantic + +[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python. + +## Schema + +LanceDB supports creating an Apache Arrow Schema from a +[Pydantic BaseModel](https://docs.pydantic.dev/latest/api/main/#pydantic.main.BaseModel) +via the [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method. + +::: lancedb.pydantic.pydantic_to_schema + +## Vector Field + +LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a +vector Field in a Pydantic Model. + +::: lancedb.pydantic.vector + +## Type Conversion + +LanceDB automatically converts Pydantic fields to +[Apache Arrow DataType](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType). 
+ +Currently supported type conversions: + +| Pydantic Field Type | PyArrow Data Type | +| ------------------- | ----------------- | +| `int` | `pyarrow.int64` | +| `float` | `pyarrow.float64` | +| `bool` | `pyarrow.bool_` | +| `str` | `pyarrow.utf8()` | +| `list` | `pyarrow.List` | +| `BaseModel` | `pyarrow.Struct` | +| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` | diff --git a/python/lancedb/pydantic.py b/python/lancedb/pydantic.py index 31cf2d5b..64bc02cc 100644 --- a/python/lancedb/pydantic.py +++ b/python/lancedb/pydantic.py @@ -41,9 +41,15 @@ def vector( ) -> Type[FixedSizeListMixin]: """Pydantic Vector Type. - Note - ---- - Experimental feature. + !!! warning + Experimental feature. + + Parameters + ---------- + dim : int + The dimension of the vector. + value_type : pyarrow.DataType, optional + The value type of the vector, by default pa.float32() Examples -------- @@ -52,9 +58,15 @@ def vector( >>> from lancedb.pydantic import vector ... >>> class MyModel(pydantic.BaseModel): - ... vector: vector(756) ... id: int - ... description: str + ... url: str + ... embeddings: vector(768) + >>> schema = pydantic_to_schema(MyModel) + >>> assert schema == pa.schema([ + ... pa.field("id", pa.int64(), False), + ... pa.field("url", pa.utf8(), False), + ... pa.field("embeddings", pa.list_(pa.float32(), 768), False) + ... ]) """ # TODO: make a public parameterized type. @@ -163,7 +175,36 @@ def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema: Returns ------- - A PyArrow Schema. + pyarrow.Schema + + Examples + -------- + + >>> from typing import List, Optional + >>> import pydantic + >>> from lancedb.pydantic import pydantic_to_schema + ... + >>> class InnerModel(pydantic.BaseModel): + ... a: str + ... b: Optional[float] + >>> + >>> class FooModel(pydantic.BaseModel): + ... id: int + ... s: Optional[str] = None + ... vec: List[float] + ... li: List[int] + ... 
inner: InnerModel + >>> schema = pydantic_to_schema(FooModel) + >>> assert schema == pa.schema([ + ... pa.field("id", pa.int64(), False), + ... pa.field("s", pa.utf8(), True), + ... pa.field("vec", pa.list_(pa.float64()), False), + ... pa.field("li", pa.list_(pa.int64()), False), + ... pa.field("inner", pa.struct([ + ... pa.field("a", pa.utf8(), False), + ... pa.field("b", pa.float64(), True), + ... ]), False), + ... ]) """ fields = _pydantic_model_to_fields(model) return pa.schema(fields)