diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md
index 05cd6c25..5808e49a 100644
--- a/docs/src/guides/tables.md
+++ b/docs/src/guides/tables.md
@@ -118,6 +118,84 @@ This guide will show how to create tables, insert data into them, and update the
     table = db.create_table(table_name, schema=Content)
     ```
 
+    #### Nested schemas
+
+    Sometimes your data model may contain nested objects.
+    For example, you may want to store the document string
+    and the document source name as a nested Document object:
+
+    ```python
+    class Document(BaseModel):
+        content: str
+        source: str
+    ```
+
+    This can be used as the type of a LanceDB table column:
+
+    ```python
+    class NestedSchema(LanceModel):
+        id: str
+        vector: Vector(1536)
+        document: Document
+
+    tbl = db.create_table("nested_table", schema=NestedSchema, mode="overwrite")
+    ```
+
+    This creates a struct column called "document" that has two subfields
+    called "content" and "source":
+
+    ```
+    In [28]: tbl.schema
+    Out[28]:
+    id: string not null
+    vector: fixed_size_list<item: float>[1536] not null
+        child 0, item: float
+    document: struct<content: string not null, source: string not null> not null
+        child 0, content: string not null
+        child 1, source: string not null
+    ```
+
+    #### Validators
+
+    Note that neither pydantic nor pyarrow automatically validates that input data
+    has the *correct* timezone, but this is easy to add as a custom field validator:
+
+    ```python
+    from datetime import datetime
+    from zoneinfo import ZoneInfo
+
+    from lancedb.pydantic import LanceModel
+    from pydantic import Field, field_validator, ValidationError, ValidationInfo
+
+    tzname = "America/New_York"
+    tz = ZoneInfo(tzname)
+
+    class TestModel(LanceModel):
+        dt_with_tz: datetime = Field(json_schema_extra={"tz": tzname})
+
+        @field_validator('dt_with_tz')
+        @classmethod
+        def tz_must_match(cls, dt: datetime) -> datetime:
+            assert dt.tzinfo == tz
+            return dt
+
+    ok = TestModel(dt_with_tz=datetime.now(tz))
+
+    try:
+        TestModel(dt_with_tz=datetime.now(ZoneInfo("Asia/Shanghai")))
+        assert 0 == 1, "this should raise ValidationError"
+    except ValidationError:
+        print("A ValidationError was raised.")
+        pass
+    ```
+
+    When you run this code, it should print "A ValidationError was raised."
+
+    #### Pydantic custom types
+
+    LanceDB does NOT yet support converting pydantic custom types. If this is something you need,
+    please file a feature request on the [LanceDB GitHub repo](https://github.com/lancedb/lancedb/issues/new).
+
     ### Using Iterators / Writing Large Datasets
     It is recommended to use iterators to add large datasets in batches when creating your table in one go. This does not create multiple versions of your dataset, unlike manually adding batches using `table.add()`
 
@@ -153,7 +231,7 @@ This guide will show how to create tables, insert data into them, and update the
     You can also use iterators of other types like Pandas DataFrames or Python lists directly in the above example.
 
     ## Creating Empty Table
-    You can also create empty tables in python. Initialize it with schema and later ingest data into it.
+    You can create empty tables in Python. Initialize the table with a schema and later ingest data into it.
 
     ```python
     import lancedb
diff --git a/python/lancedb/pydantic.py b/python/lancedb/pydantic.py
index 537d60a0..48a67189 100644
--- a/python/lancedb/pydantic.py
+++ b/python/lancedb/pydantic.py
@@ -26,6 +26,7 @@ import numpy as np
 import pyarrow as pa
 import pydantic
 import semver
+from pydantic.fields import FieldInfo
 
 from .embeddings import EmbeddingFunctionRegistry
 
@@ -142,8 +143,8 @@ def Vector(
     return FixedSizeList
 
 
-def _py_type_to_arrow_type(py_type: Type[Any]) -> pa.DataType:
-    """Convert Python Type to Arrow DataType.
+def _py_type_to_arrow_type(py_type: Type[Any], field: FieldInfo) -> pa.DataType:
+    """Convert a field with native Python type to Arrow data type.
 
     Raises
     ------
@@ -163,12 +164,13 @@ def _py_type_to_arrow_type(py_type: Type[Any]) -> pa.DataType:
     elif py_type == date:
         return pa.date32()
     elif py_type == datetime:
-        return pa.timestamp("us")
-    elif py_type.__origin__ in (list, tuple):
+        tz = get_extras(field, "tz")
+        return pa.timestamp("us", tz=tz)
+    elif getattr(py_type, "__origin__", None) in (list, tuple):
         child = py_type.__args__[0]
-        return pa.list_(_py_type_to_arrow_type(child))
+        return pa.list_(_py_type_to_arrow_type(child, field))
     raise TypeError(
-        f"Converting Pydantic type to Arrow Type: unsupported type {py_type}"
+        f"Converting Pydantic type to Arrow Type: unsupported type {py_type}."
     )
 
 
@@ -197,10 +199,10 @@ def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType:
         args = field.annotation.__args__
         if origin == list:
             child = args[0]
-            return pa.list_(_py_type_to_arrow_type(child))
+            return pa.list_(_py_type_to_arrow_type(child, field))
         elif origin == Union:
             if len(args) == 2 and args[1] == type(None):
-                return _py_type_to_arrow_type(args[0])
+                return _py_type_to_arrow_type(args[0], field)
     elif inspect.isclass(field.annotation):
         if issubclass(field.annotation, pydantic.BaseModel):
             # Struct
@@ -208,7 +210,7 @@ def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType:
             return pa.struct(fields)
     elif issubclass(field.annotation, FixedSizeListMixin):
         return pa.list_(field.annotation.value_arrow_type(), field.annotation.dim())
-    return _py_type_to_arrow_type(field.annotation)
+    return _py_type_to_arrow_type(field.annotation, field)
 
 
 def is_nullable(field: pydantic.fields.FieldInfo) -> bool:
diff --git a/python/pyproject.toml b/python/pyproject.toml
index e5c63177..b56e43c7 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -46,7 +46,7 @@ classifiers = [
 repository = "https://github.com/lancedb/lancedb"
 
 [project.optional-dependencies]
-tests = ["pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "requests", "duckdb"]
+tests = ["pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "requests", "duckdb", "pytz"]
 dev = ["ruff", "pre-commit", "black"]
 docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
 clip = ["torch", "pillow", "open-clip"]
diff --git a/python/tests/test_pydantic.py b/python/tests/test_pydantic.py
index e6739032..8a3ee16b 100644
--- a/python/tests/test_pydantic.py
+++ b/python/tests/test_pydantic.py
@@ -13,6 +13,7 @@
 import json
+import pytz
 import sys
 from datetime import date, datetime
 from typing import List, Optional, Tuple
 
@@ -38,13 +39,14 @@ def test_pydantic_to_arrow():
         id: int
         s: str
         vec: list[float]
-        li: List[int]
-        lili: List[List[float]]
-        litu: List[Tuple[float, float]]
+        li: list[int]
+        lili: list[list[float]]
+        litu: list[tuple[float, float]]
         opt: Optional[str] = None
         st: StructModel
         dt: date
         dtt: datetime
+        dt_with_tz: datetime = Field(json_schema_extra={"tz": "Asia/Shanghai"})
         # d: dict
 
     m = TestModel(
@@ -57,6 +59,7 @@
         st=StructModel(a="a", b=1.0),
         dt=date.today(),
         dtt=datetime.now(),
+        dt_with_tz=datetime.now(pytz.timezone("Asia/Shanghai")),
     )
 
     schema = pydantic_to_schema(TestModel)
@@ -79,11 +82,16 @@
             ),
             pa.field("dt", pa.date32(), False),
             pa.field("dtt", pa.timestamp("us"), False),
+            pa.field("dt_with_tz", pa.timestamp("us", tz="Asia/Shanghai"), False),
         ]
     )
     assert schema == expect_schema
 
 
+@pytest.mark.skipif(
+    sys.version_info > (3, 8),
+    reason="using native type alias requires python3.9 or higher",
+)
 def test_pydantic_to_arrow_py38():
     class StructModel(pydantic.BaseModel):
         a: str
@@ -100,6 +108,7 @@
         st: StructModel
         dt: date
         dtt: datetime
+        dt_with_tz: datetime = Field(json_schema_extra={"tz": "Asia/Shanghai"})
         # d: dict
 
     m = TestModel(
@@ -112,6 +121,7 @@
         st=StructModel(a="a", b=1.0),
         dt=date.today(),
         dtt=datetime.now(),
+        dt_with_tz=datetime.now(pytz.timezone("Asia/Shanghai")),
     )
 
     schema = pydantic_to_schema(TestModel)
@@ -134,6 +144,7 @@
             ),
             pa.field("dt", pa.date32(), False),
            pa.field("dtt", pa.timestamp("us"), False),
+            pa.field("dt_with_tz", pa.timestamp("us", tz="Asia/Shanghai"), False),
         ]
     )
     assert schema == expect_schema
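For reference, below is a minimal end-to-end sketch of the behaviour this patch enables. The `Event` model and its `ts` field are hypothetical names chosen for illustration; `LanceModel`, `pydantic_to_schema`, and the `json_schema_extra={"tz": ...}` convention are the ones exercised by the diff and tests above, so treat this as an approximation rather than official usage.

```python
from datetime import datetime
from zoneinfo import ZoneInfo

import pyarrow as pa
from pydantic import Field

from lancedb.pydantic import LanceModel, pydantic_to_schema


class Event(LanceModel):
    # Hypothetical model: the "tz" entry in json_schema_extra is what the
    # patched _py_type_to_arrow_type(..., field) reads (via get_extras) and
    # forwards to pa.timestamp("us", tz=...).
    ts: datetime = Field(json_schema_extra={"tz": "Asia/Shanghai"})


schema = pydantic_to_schema(Event)
assert schema.field("ts").type == pa.timestamp("us", tz="Asia/Shanghai")

# The Arrow type now carries the timezone, but checking that incoming values
# actually use that zone still requires a custom field validator, as shown in
# the docs section added above.
event = Event(ts=datetime.now(ZoneInfo("Asia/Shanghai")))
```

Keeping the timezone in `json_schema_extra` rather than in the annotation itself means plain `datetime` fields are unchanged: with no `"tz"` extra, `get_extras` yields `None` and the field still maps to `pa.timestamp("us")`, which is why the existing `dtt` expectation in the tests stays the same.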