[Python] Updated to_df implementation in Contextualizer class (#174)

Changes include: - Contexts of sizes less than window param to be included as well - Added optional threshold parameter to to_df in Contextualizer This should close #165 - If maintainers are satisfied with the implementation will add more examples and test cases and update the documentations as well. --------- Co-authored-by: Nithin PS <47279496+Nithinps021@users.noreply.github.com> Co-authored-by: Will Jones <willjones127@gmail.com>
2026-01-10 13:52:58 +00:00 · 2023-06-14 21:52:32 +05:30
parent d00f4e51d0
commit 6b5c046c3b
2 changed files with 140 additions and 21 deletions
--- a/python/lancedb/context.py
+++ b/python/lancedb/context.py
@@ -42,34 +42,38 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
    paragraphs, messages, etc.

    >>> contextualize(data).window(3).stride(1).text_col('token').to_df()
-                  token  document_id
-    0   The quick brown            1
-    1   quick brown fox            1
-    2  brown fox jumped            1
-    3   fox jumped over            1
-    4   jumped over the            1
-    5     over the lazy            1
-    6      the lazy dog            1
-    7        lazy dog I            1
-    8        dog I love            1
-    >>> contextualize(data).window(7).stride(1).text_col('token').to_df()
+                    token  document_id
+    0     The quick brown            1
+    1     quick brown fox            1
+    2    brown fox jumped            1
+    3     fox jumped over            1
+    4     jumped over the            1
+    5       over the lazy            1
+    6        the lazy dog            1
+    7          lazy dog I            1
+    8          dog I love            1
+    9   I love sandwiches            2
+    10    love sandwiches            2
+    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
                                      token  document_id
    0   The quick brown fox jumped over the            1
    1  quick brown fox jumped over the lazy            1
    2    brown fox jumped over the lazy dog            1
    3        fox jumped over the lazy dog I            1
    4       jumped over the lazy dog I love            1
-
+    5   over the lazy dog I love sandwiches            1

    ``stride`` determines how many rows to skip between each window start. This can
    be used to reduce the total number of windows generated.

    >>> contextualize(data).window(4).stride(2).text_col('token').to_df()
-                       token  document_id
-    0    The quick brown fox            1
-    2  brown fox jumped over            1
-    4   jumped over the lazy            1
-    6         the lazy dog I            1
+                        token  document_id
+    0     The quick brown fox            1
+    2   brown fox jumped over            1
+    4    jumped over the lazy            1
+    6          the lazy dog I            1
+    8   dog I love sandwiches            1
+    10        love sandwiches            2

    ``groupby`` determines how to group the rows. For example, we would like to have
    context windows that don't cross document boundaries. In this case, we can
@@ -80,6 +84,25 @@ def contextualize(raw_df: pd.DataFrame) -> Contextualizer:
    0    The quick brown fox            1
    2  brown fox jumped over            1
    4   jumped over the lazy            1
+    6           the lazy dog            1
+    9      I love sandwiches            2
+
+    ``min_window_size`` determines the minimum size of the  context windows that are generated
+    This can be used to trim the last few context windows which have size less than
+    ``min_window_size``. By default context windows of size 1 are skipped.
+
+    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
+                                 token  document_id
+    0  The quick brown fox jumped over            1
+    3     fox jumped over the lazy dog            1
+    6                     the lazy dog            1
+    9                I love sandwiches            2
+
+    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
+                                 token  document_id
+    0  The quick brown fox jumped over            1
+    3     fox jumped over the lazy dog            1
+
    """
    return Contextualizer(raw_df)

@@ -92,6 +115,7 @@ class Contextualizer:
        self._groupby = None
        self._stride = None
        self._window = None
+        self._min_window_size = 2
        self._raw_df = raw_df

    def window(self, window: int) -> Contextualizer:
@@ -139,6 +163,17 @@ class Contextualizer:
        self._text_col = text_col
        return self

+    def min_window_size(self, min_window_size: int) -> Contextualizer:
+        """Set the (optional) min_window_size size for the context window.
+
+        Parameters
+        ----------
+        min_window_size: int
+            The min_window_size.
+        """
+        self._min_window_size = min_window_size
+        return self
+
    def to_df(self) -> pd.DataFrame:
        """Create the context windows and return a DataFrame."""

@@ -159,12 +194,19 @@ class Contextualizer:

        def process_group(grp):
            # For each group, create the text rolling window
+            # with values of size >= min_window_size
            text = grp[self._text_col].values
-            contexts = grp.iloc[: -self._window : self._stride, :].copy()
-            contexts[self._text_col] = [
-                " ".join(text[start_i : start_i + self._window])
-                for start_i in range(0, len(grp) - self._window, self._stride)
+            contexts = grp.iloc[:: self._stride, :].copy()
+            windows = [
+                " ".join(text[start_i : min(start_i + self._window, len(grp))])
+                for start_i in range(0, len(grp), self._stride)
+                if start_i + self._window <= len(grp)
+                or len(grp) - start_i >= self._min_window_size
            ]
+            # if last few rows dropped
+            if len(windows) < len(contexts):
+                contexts = contexts.iloc[: len(windows)]
+            contexts[self._text_col] = windows
            return contexts

        if self._groupby is None: