feat: add to_list and to_pandas api's (#556)

Add `to_list` to return query results as list of python dict (so we're
not too pandas-centric). Closes #555

Add `to_pandas` API and add deprecation warning on `to_df`. Closes #545

Co-authored-by: Chang She <chang@lancedb.com>
This commit is contained in:
Chang She
2023-10-11 12:18:55 -07:00
committed by GitHub
parent ababc3f8ec
commit e1ae2bcbd8
26 changed files with 125 additions and 71 deletions

View File

@@ -14,13 +14,13 @@
import importlib.metadata
from typing import Optional
__version__ = importlib.metadata.version("lancedb")
from .db import URI, DBConnection, LanceDBConnection
from .remote.db import RemoteDBConnection
from .schema import vector
from .utils import sentry_log
__version__ = importlib.metadata.version("lancedb")
def connect(
uri: URI,

View File

@@ -12,6 +12,9 @@
# limitations under the License.
from __future__ import annotations
import deprecation
from . import __version__
from .exceptions import MissingColumnError, MissingValueError
from .util import safe_import_pandas
@@ -43,7 +46,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
this how many tokens, but depending on the input data, it could be sentences,
paragraphs, messages, etc.
>>> contextualize(data).window(3).stride(1).text_col('token').to_df()
>>> contextualize(data).window(3).stride(1).text_col('token').to_pandas()
token document_id
0 The quick brown 1
1 quick brown fox 1
@@ -56,7 +59,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
8 dog I love 1
9 I love sandwiches 2
10 love sandwiches 2
>>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
>>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_pandas()
token document_id
0 The quick brown fox jumped over the 1
1 quick brown fox jumped over the lazy 1
@@ -68,7 +71,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
``stride`` determines how many rows to skip between each window start. This can
be used to reduce the total number of windows generated.
>>> contextualize(data).window(4).stride(2).text_col('token').to_df()
>>> contextualize(data).window(4).stride(2).text_col('token').to_pandas()
token document_id
0 The quick brown fox 1
2 brown fox jumped over 1
@@ -81,7 +84,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
context windows that don't cross document boundaries. In this case, we can
pass ``document_id`` as the group by.
>>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_df()
>>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_pandas()
token document_id
0 The quick brown fox 1
2 brown fox jumped over 1
@@ -93,14 +96,14 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
This can be used to trim the last few context windows which have size less than
``min_window_size``. By default context windows of size 1 are skipped.
>>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
>>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_pandas()
token document_id
0 The quick brown fox jumped over 1
3 fox jumped over the lazy dog 1
6 the lazy dog 1
9 I love sandwiches 2
>>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
>>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_pandas()
token document_id
0 The quick brown fox jumped over 1
3 fox jumped over the lazy dog 1
@@ -176,7 +179,16 @@ class Contextualizer:
self._min_window_size = min_window_size
return self
@deprecation.deprecated(
deprecated_in="0.3.1",
removed_in="0.4.0",
current_version=__version__,
details="Use the bar function instead",
)
def to_df(self) -> "pd.DataFrame":
return self.to_pandas()
def to_pandas(self) -> "pd.DataFrame":
"""Create the context windows and return a DataFrame."""
if pd is None:
raise ImportError(

View File

@@ -16,10 +16,12 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from typing import List, Literal, Optional, Type, Union
import deprecation
import numpy as np
import pyarrow as pa
import pydantic
from . import __version__
from .common import VECTOR_COLUMN_NAME
from .pydantic import LanceModel
from .util import safe_import_pandas
@@ -127,7 +129,24 @@ class LanceQueryBuilder(ABC):
self._columns = None
self._where = None
@deprecation.deprecated(
deprecated_in="0.3.1",
removed_in="0.4.0",
current_version=__version__,
details="Use the bar function instead",
)
def to_df(self) -> "pd.DataFrame":
"""
Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
return self.to_pandas()
def to_pandas(self) -> "pd.DataFrame":
"""
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
@@ -148,6 +167,16 @@ class LanceQueryBuilder(ABC):
"""
raise NotImplementedError
def to_list(self) -> List[dict]:
"""
Execute the query and return the results as a list of dictionaries.
Each list entry is a dictionary with the selected column names as keys,
or all table columns if `select` is not called. The vector and the "_distance"
fields are returned whether or not they're explicitly selected.
"""
return self.to_arrow().to_pylist()
def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
"""Return the table as a list of pydantic models.
@@ -232,7 +261,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
... .where("b < 10")
... .select(["b"])
... .limit(2)
... .to_df())
... .to_pandas())
b vector _distance
0 6 [0.4, 0.4] 0.0
"""

View File

@@ -137,7 +137,7 @@ class Table(ABC):
Can query the table with [Table.search][lancedb.table.Table.search].
>>> table.search([0.4, 0.4]).select(["b"]).to_df()
>>> table.search([0.4, 0.4]).select(["b"]).to_pandas()
b vector _distance
0 4 [0.5, 1.3] 0.82
1 2 [1.1, 1.2] 1.13