mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-10 13:52:58 +00:00
[Python] Updated to_df implementation in Contextualizer class (#174)
Changes include: - Contexts of sizes less than window param to be included as well - Added optional threshold parameter to to_df in Contextualizer This should close #165 - If maintainers are satisfied with the implementation will add more examples and test cases and update the documentations as well. --------- Co-authored-by: Nithin PS <47279496+Nithinps021@users.noreply.github.com> Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
@@ -42,34 +42,38 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
|
||||
paragraphs, messages, etc.
|
||||
|
||||
>>> contextualize(data).window(3).stride(1).text_col('token').to_df()
|
||||
token document_id
|
||||
0 The quick brown 1
|
||||
1 quick brown fox 1
|
||||
2 brown fox jumped 1
|
||||
3 fox jumped over 1
|
||||
4 jumped over the 1
|
||||
5 over the lazy 1
|
||||
6 the lazy dog 1
|
||||
7 lazy dog I 1
|
||||
8 dog I love 1
|
||||
>>> contextualize(data).window(7).stride(1).text_col('token').to_df()
|
||||
token document_id
|
||||
0 The quick brown 1
|
||||
1 quick brown fox 1
|
||||
2 brown fox jumped 1
|
||||
3 fox jumped over 1
|
||||
4 jumped over the 1
|
||||
5 over the lazy 1
|
||||
6 the lazy dog 1
|
||||
7 lazy dog I 1
|
||||
8 dog I love 1
|
||||
9 I love sandwiches 2
|
||||
10 love sandwiches 2
|
||||
>>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
|
||||
token document_id
|
||||
0 The quick brown fox jumped over the 1
|
||||
1 quick brown fox jumped over the lazy 1
|
||||
2 brown fox jumped over the lazy dog 1
|
||||
3 fox jumped over the lazy dog I 1
|
||||
4 jumped over the lazy dog I love 1
|
||||
|
||||
5 over the lazy dog I love sandwiches 1
|
||||
|
||||
``stride`` determines how many rows to skip between each window start. This can
|
||||
be used to reduce the total number of windows generated.
|
||||
|
||||
>>> contextualize(data).window(4).stride(2).text_col('token').to_df()
|
||||
token document_id
|
||||
0 The quick brown fox 1
|
||||
2 brown fox jumped over 1
|
||||
4 jumped over the lazy 1
|
||||
6 the lazy dog I 1
|
||||
token document_id
|
||||
0 The quick brown fox 1
|
||||
2 brown fox jumped over 1
|
||||
4 jumped over the lazy 1
|
||||
6 the lazy dog I 1
|
||||
8 dog I love sandwiches 1
|
||||
10 love sandwiches 2
|
||||
|
||||
``groupby`` determines how to group the rows. For example, we would like to have
|
||||
context windows that don't cross document boundaries. In this case, we can
|
||||
@@ -80,6 +84,25 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
|
||||
0 The quick brown fox 1
|
||||
2 brown fox jumped over 1
|
||||
4 jumped over the lazy 1
|
||||
6 the lazy dog 1
|
||||
9 I love sandwiches 2
|
||||
|
||||
``min_window_size`` determines the minimum size of the context windows that are generated
|
||||
This can be used to trim the last few context windows which have size less than
|
||||
``min_window_size``. By default context windows of size 1 are skipped.
|
||||
|
||||
>>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
|
||||
token document_id
|
||||
0 The quick brown fox jumped over 1
|
||||
3 fox jumped over the lazy dog 1
|
||||
6 the lazy dog 1
|
||||
9 I love sandwiches 2
|
||||
|
||||
>>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
|
||||
token document_id
|
||||
0 The quick brown fox jumped over 1
|
||||
3 fox jumped over the lazy dog 1
|
||||
|
||||
"""
|
||||
return Contextualizer(raw_df)
|
||||
|
||||
@@ -92,6 +115,7 @@ class Contextualizer:
|
||||
self._groupby = None
|
||||
self._stride = None
|
||||
self._window = None
|
||||
self._min_window_size = 2
|
||||
self._raw_df = raw_df
|
||||
|
||||
def window(self, window: int) -> Contextualizer:
|
||||
@@ -139,6 +163,17 @@ class Contextualizer:
|
||||
self._text_col = text_col
|
||||
return self
|
||||
|
||||
def min_window_size(self, min_window_size: int) -> Contextualizer:
|
||||
"""Set the (optional) min_window_size size for the context window.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
min_window_size: int
|
||||
The min_window_size.
|
||||
"""
|
||||
self._min_window_size = min_window_size
|
||||
return self
|
||||
|
||||
def to_df(self) -> pd.DataFrame:
|
||||
"""Create the context windows and return a DataFrame."""
|
||||
|
||||
@@ -159,12 +194,19 @@ class Contextualizer:
|
||||
|
||||
def process_group(grp):
|
||||
# For each group, create the text rolling window
|
||||
# with values of size >= min_window_size
|
||||
text = grp[self._text_col].values
|
||||
contexts = grp.iloc[: -self._window : self._stride, :].copy()
|
||||
contexts[self._text_col] = [
|
||||
" ".join(text[start_i : start_i + self._window])
|
||||
for start_i in range(0, len(grp) - self._window, self._stride)
|
||||
contexts = grp.iloc[:: self._stride, :].copy()
|
||||
windows = [
|
||||
" ".join(text[start_i : min(start_i + self._window, len(grp))])
|
||||
for start_i in range(0, len(grp), self._stride)
|
||||
if start_i + self._window <= len(grp)
|
||||
or len(grp) - start_i >= self._min_window_size
|
||||
]
|
||||
# if last few rows dropped
|
||||
if len(windows) < len(contexts):
|
||||
contexts = contexts.iloc[: len(windows)]
|
||||
contexts[self._text_col] = windows
|
||||
return contexts
|
||||
|
||||
if self._groupby is None:
|
||||
|
||||
Reference in New Issue
Block a user