mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-19 13:00:40 +00:00
feat(python): add option to flatten output in to_pandas (#722)
Closes https://github.com/lancedb/lance/issues/1738 We add a `flatten` parameter to the signature of `to_pandas`. By default this is None and does nothing. If set to True or -1, then LanceDB will flatten structs before converting to a pandas dataframe. All nested structs are also flattened. If set to any positive integer, then LanceDB will flatten structs up to the specified level of nesting. --------- Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
@@ -2,3 +2,4 @@ mkdocs==1.4.2
|
||||
mkdocs-jupyter==0.24.1
|
||||
mkdocs-material==9.1.3
|
||||
mkdocstrings[python]==0.20.0
|
||||
pydantic
|
||||
@@ -118,4 +118,101 @@ However, fast vector search using indices often entails making a trade-off with
|
||||
This is why it is often called **Approximate Nearest Neighbors (ANN)** search, while the Flat Search (KNN)
|
||||
always returns 100% recall.
|
||||
|
||||
See [ANN Index](ann_indexes.md) for more details.
|
||||
See [ANN Index](ann_indexes.md) for more details.
|
||||
|
||||
|
||||
### Output formats
|
||||
|
||||
LanceDB can return results in several formats commonly used in Python.
|
||||
Let's create a LanceDB table with a nested schema:
|
||||
|
||||
```python
|
||||
from datetime import datetime
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
import numpy as np
|
||||
from pydantic import BaseModel
|
||||
uri = "data/sample-lancedb-nested"
|
||||
|
||||
class Metadata(BaseModel):
|
||||
source: str
|
||||
timestamp: datetime
|
||||
|
||||
class Document(BaseModel):
|
||||
content: str
|
||||
meta: Metadata
|
||||
|
||||
class LanceSchema(LanceModel):
|
||||
id: str
|
||||
vector: Vector(1536)
|
||||
payload: Document
|
||||
|
||||
# Let's add 100 sample rows to our dataset
|
||||
data = [LanceSchema(
|
||||
id=f"id{i}",
|
||||
vector=np.random.randn(1536),
|
||||
payload=Document(
|
||||
content=f"document{i}", meta=Metadata(source=f"source{i%10}", timestamp=datetime.now())
|
||||
),
|
||||
) for i in range(100)]
|
||||
|
||||
tbl = db.create_table("documents", data=data)
|
||||
```
|
||||
|
||||
#### As a pyarrow table
|
||||
|
||||
Using `to_arrow()` we can get the results back as a pyarrow Table.
|
||||
This result table has the same columns as the LanceDB table, with
|
||||
the addition of an `_distance` column for vector search or a `score`
|
||||
column for full text search.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_arrow()
|
||||
```
|
||||
|
||||
#### As a pandas dataframe
|
||||
|
||||
You can also get the results as a pandas dataframe.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_pandas()
|
||||
```
|
||||
|
||||
While other formats like Arrow/Pydantic/Python dicts have a natural
|
||||
way to handle nested schemas, pandas can only store nested data as a
|
||||
python dict column, which makes it difficult to support nested references.
|
||||
So for convenience, you can also tell LanceDB to flatten a nested schema
|
||||
when creating the pandas dataframe.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_pandas(flatten=True)
|
||||
```
|
||||
|
||||
If your table has a deeply nested struct, you can control how many levels
|
||||
of nesting to flatten by passing in a positive integer.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_pandas(flatten=1)
|
||||
```
|
||||
|
||||
|
||||
#### As a list of python dicts
|
||||
|
||||
You can of course return results as a list of python dicts.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_list()
|
||||
```
|
||||
|
||||
#### As a list of pydantic models
|
||||
|
||||
We can add data using pydantic models, and we can certainly
|
||||
retrieve results as pydantic models.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.randn(1536)).to_pydantic(LanceSchema)
|
||||
```
|
||||
|
||||
Note that in this case the extra `_distance` field is discarded since
|
||||
it's not part of the LanceSchema.
|
||||
|
||||
|
||||
@@ -185,14 +185,40 @@ class LanceQueryBuilder(ABC):
|
||||
"""
|
||||
return self.to_pandas()
|
||||
|
||||
def to_pandas(self) -> "pd.DataFrame":
|
||||
def to_pandas(self, flatten: Optional[Union[int, bool]] = None) -> "pd.DataFrame":
    """
    Execute the query and return the results as a pandas DataFrame.
    In addition to the selected columns, LanceDB also returns a vector
    and the "_distance" column, which is the distance between the query
    vector and the returned vector.

    Parameters
    ----------
    flatten: Optional[Union[int, bool]]
        If flatten is True, flatten all nested columns.
        If flatten is a positive integer, flatten the nested columns up to
        the specified depth.
        If unspecified (None) or False, do not flatten the nested columns.

    Raises
    ------
    ValueError
        If flatten is an integer that is not positive.
    """
    tbl = self.to_arrow()
    if flatten is True:
        # Each flatten() call unnests a single level of struct columns, so
        # keep going until no struct column is left in the schema.
        while any(pa.types.is_struct(col.type) for col in tbl.schema):
            tbl = tbl.flatten()
    elif isinstance(flatten, int) and not isinstance(flatten, bool):
        # bool is a subclass of int; excluding it here lets flatten=False
        # mean "do not flatten" instead of tripping the range check below.
        if flatten <= 0:
            raise ValueError(
                "Please specify a positive integer for flatten or the boolean value `True`"
            )
        for _ in range(flatten):
            tbl = tbl.flatten()
    return tbl.to_pandas()
|
||||
|
||||
@abstractmethod
|
||||
def to_arrow(self) -> pa.Table:
|
||||
|
||||
@@ -144,9 +144,13 @@ def test_add(db):
|
||||
def test_add_pydantic_model(db):
|
||||
# https://github.com/lancedb/lancedb/issues/562
|
||||
|
||||
class Metadata(BaseModel):
|
||||
source: str
|
||||
timestamp: datetime
|
||||
|
||||
class Document(BaseModel):
|
||||
content: str
|
||||
source: str
|
||||
meta: Metadata
|
||||
|
||||
class LanceSchema(LanceModel):
|
||||
id: str
|
||||
@@ -162,13 +166,21 @@ def test_add_pydantic_model(db):
|
||||
id="id",
|
||||
vector=[0.0, 0.0],
|
||||
li=[1, 2, 3],
|
||||
payload=Document(content="foo", source="bar"),
|
||||
payload=Document(
|
||||
content="foo", meta=Metadata(source="bar", timestamp=datetime.now())
|
||||
),
|
||||
)
|
||||
tbl.add([expected])
|
||||
|
||||
result = tbl.search([0.0, 0.0]).limit(1).to_pydantic(LanceSchema)[0]
|
||||
assert result == expected
|
||||
|
||||
flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=1)
|
||||
assert len(flattened.columns) == 6 # _distance is automatically added
|
||||
|
||||
really_flattened = tbl.search([0.0, 0.0]).limit(1).to_pandas(flatten=True)
|
||||
assert len(really_flattened.columns) == 7
|
||||
|
||||
|
||||
def _add(table, schema):
|
||||
# table = LanceTable(db, "test")
|
||||
|
||||
Reference in New Issue
Block a user