[Python] Updated to_df implementation in Contextualizer class (#174)

Changes include: - Contexts smaller than the window parameter are now included as well - Added an optional threshold parameter to to_df in Contextualizer. This should close #165. If the maintainers are satisfied with the implementation, I will add more examples and test cases and update the documentation as well. --------- Co-authored-by: Nithin PS <47279496+Nithinps021@users.noreply.github.com> Co-authored-by: Will Jones <willjones127@gmail.com>
2026-05-21 14:00:40 +00:00 · 2023-06-14 21:52:32 +05:30
parent d00f4e51d0
commit 6b5c046c3b
2 changed files with 140 additions and 21 deletions
--- a/python/tests/test_context.py
+++ b/python/tests/test_context.py
@@ -0,0 +1,77 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import pandas as pd
+import pytest
+
+from lancedb.context import contextualize
+
+
+@pytest.fixture
+def raw_df() -> pd.DataFrame:  # fixture: 12 rows, one token per row, each tagged with a document id
+    return pd.DataFrame(
+        {
+            "token": [
+                "The",
+                "quick",
+                "brown",
+                "fox",
+                "jumped",
+                "over",
+                "the",
+                "lazy",
+                "dog",
+                "I",
+                "love",
+                "sandwiches",
+            ],
+            "document_id": [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2],  # first 9 tokens -> doc 1, last 3 -> doc 2
+        }
+    )
+
+
+def test_contextualizer(raw_df: pd.DataFrame):  # no min_window_size is set in this chain
+    result = (
+        contextualize(raw_df)
+        .window(6)
+        .stride(3)
+        .text_col("token")
+        .groupby("document_id")
+        .to_df()["token"]
+        .to_list()
+    )
+    # Expected: per-document contexts of up to 6 tokens starting every 3 tokens; the 3-token tails are kept.
+    assert result == [
+        "The quick brown fox jumped over",
+        "fox jumped over the lazy dog",
+        "the lazy dog",
+        "I love sandwiches",
+    ]
+
+
+def test_contextualizer_with_threshold(raw_df: pd.DataFrame):  # same chain plus min_window_size(4)
+    result = (
+        contextualize(raw_df)
+        .window(6)
+        .stride(3)
+        .text_col("token")
+        .groupby("document_id")
+        .min_window_size(4)
+        .to_df()["token"]
+        .to_list()
+    )
+    # Expected: only the two 6-token contexts; the 3-token ones ("the lazy dog", "I love sandwiches") are absent.
+    assert result == [
+        "The quick brown fox jumped over",
+        "fox jumped over the lazy dog",
+    ]