diff --git a/docs/requirements.txt b/docs/requirements.txt
index c2e2ab41..ba139c65 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -2,3 +2,4 @@ mkdocs==1.4.2
 mkdocs-jupyter==0.24.1
 mkdocs-material==9.1.3
 mkdocstrings[python]==0.20.0
+pydantic
\ No newline at end of file
diff --git a/docs/src/search.md b/docs/src/search.md
index a25b0534..7d8dbbbf 100644
--- a/docs/src/search.md
+++ b/docs/src/search.md
@@ -118,4 +118,101 @@ However, fast vector search using indices often entails making a trade-off with
 This is why it is often called **Approximate Nearest Neighbors (ANN)** search,
 while the Flat Search (KNN) always returns 100% recall.
 
-See [ANN Index](ann_indexes.md) for more details.
\ No newline at end of file
+See [ANN Index](ann_indexes.md) for more details.
+
+
+### Output formats
+
+LanceDB returns results in many different formats commonly used in Python.
+Let's create a LanceDB table with a nested schema:
+
+```python
+from datetime import datetime
+
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+import numpy as np
+from pydantic import BaseModel
+
+uri = "data/sample-lancedb-nested"
+db = lancedb.connect(uri)
+
+class Metadata(BaseModel):
+    source: str
+    timestamp: datetime
+
+class Document(BaseModel):
+    content: str
+    meta: Metadata
+
+class LanceSchema(LanceModel):
+    id: str
+    vector: Vector(1536)
+    payload: Document
+
+# Let's add 100 sample rows to our dataset
+data = [LanceSchema(
+    id=f"id{i}",
+    vector=np.random.randn(1536),
+    payload=Document(
+        content=f"document{i}",
+        meta=Metadata(source=f"source{i % 10}", timestamp=datetime.now()),
+    ),
+) for i in range(100)]
+
+tbl = db.create_table("documents", data=data)
+```
+
+#### As a pyarrow table
+
+Using `to_arrow()`, we can get the results back as a pyarrow `Table`.
+This result table has the same columns as the LanceDB table, with
+the addition of an `_distance` column for vector search or a `score`
+column for full text search.
+
+```python
+tbl.search(np.random.randn(1536)).to_arrow()
+```
+
+#### As a pandas dataframe
+
+You can also get the results as a pandas dataframe.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas()
+```
+
+While other formats like Arrow/Pydantic/Python dicts have a natural
+way to handle nested schemas, pandas can only store nested data as a
+Python dict column, which makes it difficult to support nested references.
+So for convenience, you can also tell LanceDB to flatten a nested schema
+when creating the pandas dataframe.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas(flatten=True)
+```
+
+If your table has a deeply nested struct, you can control how many levels
+of nesting to flatten by passing in a positive integer.
+
+```python
+tbl.search(np.random.randn(1536)).to_pandas(flatten=1)
+```
+
+#### As a list of python dicts
+
+You can of course return results as a list of Python dicts.
+
+```python
+tbl.search(np.random.randn(1536)).to_list()
+```
+
+#### As a list of pydantic models
+
+Since we added data using pydantic models, we can also retrieve the
+results as pydantic models.
+
+```python
+tbl.search(np.random.randn(1536)).to_pydantic(LanceSchema)
+```
+
+Note that in this case the extra `_distance` field is discarded since
+it's not part of `LanceSchema`.
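The docs above describe flattening in LanceDB terms, but the mechanism underneath is pyarrow's `Table.flatten()` (which the query builder calls in the diff below). Here is a minimal, self-contained sketch using plain pyarrow, with illustrative column names that mirror the `payload` column from the docs example, showing how one flatten pass turns a struct column into dotted top-level columns and why repeating it eventually removes all structs:

```python
import pyarrow as pa

# Illustrative table with a nested struct column, shaped like the `payload`
# column from the docs example above (names and values are made up).
tbl = pa.table({
    "id": ["id0", "id1"],
    "payload": [
        {"content": "document0", "meta": {"source": "source0"}},
        {"content": "document1", "meta": {"source": "source1"}},
    ],
})

# One flatten pass expands each struct column into `parent.child` columns.
once = tbl.flatten()
print(once.column_names)
# ['id', 'payload.content', 'payload.meta']  (payload.meta is still a struct)

# Repeating the pass expands the remaining struct; flatten=True keeps
# looping like this until no struct columns are left.
twice = once.flatten()
print(twice.column_names)
# ['id', 'payload.content', 'payload.meta.source']
```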
diff --git a/python/lancedb/query.py b/python/lancedb/query.py
index c73e82f8..fe2dc86c 100644
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -185,14 +185,40 @@ class LanceQueryBuilder(ABC):
         """
         return self.to_pandas()
 
-    def to_pandas(self) -> "pd.DataFrame":
+    def to_pandas(self, flatten: Optional[Union[int, bool]] = None) -> "pd.DataFrame":
         """
         Execute the query and return the results as a pandas DataFrame.
         In addition to the selected columns, LanceDB also returns a vector
         and also the "_distance" column which is the distance between the query
         vector and the returned vector.
+
+        Parameters
+        ----------
+        flatten: Optional[Union[int, bool]]
+            If flatten is True, flatten all nested columns.
+            If flatten is an integer, flatten the nested columns up to the
+            specified depth.
+            If unspecified, do not flatten the nested columns.
         """
-        return self.to_arrow().to_pandas()
+        tbl = self.to_arrow()
+        if flatten is True:
+            # keep flattening until no struct columns remain
+            while any(pa.types.is_struct(field.type) for field in tbl.schema):
+                tbl = tbl.flatten()
+        elif isinstance(flatten, int):
+            if flatten <= 0:
+                raise ValueError(
+                    "Please specify a positive integer for flatten "
+                    "or the boolean value `True`"
+                )
+            # flatten one level of nesting per pass, up to the requested depth
+            while flatten > 0:
+                tbl = tbl.flatten()
+                flatten -= 1
+        return tbl.to_pandas()
 
     @abstractmethod
     def to_arrow(self) -> pa.Table:
diff --git a/python/tests/test_table.py b/python/tests/test_table.py
index cbe641cd..3ed1eede 100644
--- a/python/tests/test_table.py
+++ b/python/tests/test_table.py
@@ -144,9 +144,13 @@ def test_add(db):
 
 def test_add_pydantic_model(db):
     # https://github.com/lancedb/lancedb/issues/562
+    class Metadata(BaseModel):
+        source: str
+        timestamp: datetime
+
     class Document(BaseModel):
         content: str
-        source: str
+        meta: Metadata
 
     class LanceSchema(LanceModel):
         id: str
@@ -162,13 +166,21 @@ def test_add_pydantic_model(db):
         id="id",
         vector=[0.0, 0.0],
         li=[1, 2, 3],
-        payload=Document(content="foo", source="bar"),
+        payload=Document(
+            content="foo", meta=Metadata(source="bar", timestamp=datetime.now())
+        ),
     )
     tbl.add([expected])
     result = tbl.search([0.0, 0.0]).limit(1).to_pydantic(LanceSchema)[0]
     assert result == expected
 
+    flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=1)
+    assert len(flattened.columns) == 6  # _distance is automatically added
+
+    really_flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=True)
+    assert len(really_flattened.columns) == 7
+
 
 def _add(table, schema):
     # table = LanceTable(db, "test")
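For trying the new parameter outside the test suite, a rough usage sketch against the table from the docs example might look like the following; the connection path and table name are just the placeholders used above, and only `to_pandas(flatten=...)` comes from this change:

```python
import lancedb
import numpy as np

# Placeholder path and table name from the docs example above.
db = lancedb.connect("data/sample-lancedb-nested")
tbl = db.open_table("documents")

query = np.random.randn(1536)

# Default: the nested `payload` column comes back as a single dict column.
print(tbl.search(query).limit(3).to_pandas().columns.tolist())

# flatten=1 expands one level of nesting (plus the automatic _distance column).
print(tbl.search(query).limit(3).to_pandas(flatten=1).columns.tolist())

# flatten=True keeps expanding until no struct columns remain.
print(tbl.search(query).limit(3).to_pandas(flatten=True).columns.tolist())
```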