From 92f0b16e46f343a66d539c20e88df64d586037bb Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Mon, 21 Apr 2025 13:42:13 -0700
Subject: [PATCH] fix(python): make sure pandas is optional (#2346)

Fixes #2344

## Summary by CodeRabbit

- **Tests**
  - Updated tests to use PyArrow Tables instead of pandas DataFrames where possible, reducing reliance on pandas (see the sketch after this list).
  - Tests that require pandas are now automatically skipped if pandas is not installed.
- **Chores**
  - Improved workflow to uninstall both pylance and pandas in a specific test step.
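Both test-side mechanisms are condensed below. This is a minimal sketch, not a test from the suite: the function names are illustrative, while `pytest.importorskip`, `pa.Table.from_pylist`, and `_PANDAS_AVAILABLE` (from `lancedb.dependencies`) are the pieces the real changes in `python/python/tests/test_table.py` use.

```python
import pyarrow as pa
import pytest

from lancedb.dependencies import _PANDAS_AVAILABLE


def test_needs_pandas():
    # importorskip skips this test when pandas is missing, instead of
    # failing at collection time with a ModuleNotFoundError.
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"id": [1, 2]})
    assert len(df) == 2


def test_runs_without_pandas():
    # Build the fixture data with PyArrow directly; no pandas needed.
    tbl = pa.Table.from_pylist([{"id": 1}, {"id": 2}])
    assert tbl.num_rows == 2

    # Exercise the pandas code path only when pandas is available.
    if _PANDAS_AVAILABLE:
        import pandas as pd

        df = pd.DataFrame({"id": [1, 2]})
        assert len(df) == 2
```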
"item": "bar"} table.add([data]) @@ -405,6 +408,7 @@ def test_add_nullability(mem_db: DBConnection): def test_add_pydantic_model(mem_db: DBConnection): + pytest.importorskip("pandas") # https://github.com/lancedb/lancedb/issues/562 class Metadata(BaseModel): @@ -473,10 +477,10 @@ def test_polars(mem_db: DBConnection): table = mem_db.create_table("test", data=pl.DataFrame(data)) assert len(table) == 2 - result = table.to_pandas() - assert np.allclose(result["vector"].tolist(), data["vector"]) - assert result["item"].tolist() == data["item"] - assert np.allclose(result["price"].tolist(), data["price"]) + result = table.to_arrow() + assert np.allclose(result["vector"].to_pylist(), data["vector"]) + assert result["item"].to_pylist() == data["item"] + assert np.allclose(result["price"].to_pylist(), data["price"]) schema = pa.schema( [ @@ -688,7 +692,7 @@ def test_delete(mem_db: DBConnection): assert len(table.list_versions()) == 2 assert table.version == 2 assert len(table) == 1 - assert table.to_pandas()["id"].tolist() == [1] + assert table.to_arrow()["id"].to_pylist() == [1] def test_update(mem_db: DBConnection): @@ -852,6 +856,7 @@ def test_merge_insert(mem_db: DBConnection): ids=["pa.Table", "pd.DataFrame", "rows"], ) def test_merge_insert_subschema(mem_db: DBConnection, data_format): + pytest.importorskip("pandas") initial_data = pa.table( {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]} ) @@ -948,7 +953,7 @@ def test_create_with_embedding_function(mem_db: DBConnection): func = MockTextEmbeddingFunction.create() texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"] - df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)}) + df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)}) conf = EmbeddingFunctionConfig( source_column="text", vector_column="vector", function=func @@ -973,7 +978,7 @@ def test_create_f16_table(mem_db: DBConnection): text: str vector: Vector(32, value_type=pa.float16()) - df = pd.DataFrame( + df = pa.table( { "text": [f"s-{i}" for i in range(512)], "vector": [np.random.randn(32).astype(np.float16) for _ in range(512)], @@ -986,7 +991,7 @@ def test_create_f16_table(mem_db: DBConnection): table.add(df) table.create_index(num_partitions=2, num_sub_vectors=2) - query = df.vector.iloc[2] + query = df["vector"][2].as_py() expected = table.search(query).limit(2).to_arrow() assert "s-2" in expected["text"].to_pylist() @@ -1002,7 +1007,7 @@ def test_add_with_embedding_function(mem_db: DBConnection): table = mem_db.create_table("my_table", schema=MyTable) texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"] - df = pd.DataFrame({"text": texts}) + df = pa.table({"text": texts}) table.add(df) texts = ["the quick brown fox", "jumped over the lazy dog"] @@ -1033,14 +1038,14 @@ def test_multiple_vector_columns(mem_db: DBConnection): {"vector1": v1, "vector2": v2, "text": "foo"}, {"vector1": v2, "vector2": v1, "text": "bar"}, ] - df = pd.DataFrame(data) + df = pa.Table.from_pylist(data) table.add(df) q = np.random.randn(10) - result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas() - result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas() + result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow() + result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow() - assert result1["text"].iloc[0] != result2["text"].iloc[0] + assert result1["text"][0] != result2["text"][0] def test_create_scalar_index(mem_db: DBConnection): @@ 
diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py
index 2bb18989..6624de19 100644
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -9,9 +9,9 @@ from typing import List
 from unittest.mock import patch
 
 import lancedb
+from lancedb.dependencies import _PANDAS_AVAILABLE
 from lancedb.index import HnswPq, HnswSq, IvfPq
 import numpy as np
-import pandas as pd
 import polars as pl
 import pyarrow as pa
 import pyarrow.dataset
@@ -138,13 +138,16 @@ def test_create_table(mem_db: DBConnection):
         {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
         {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
     ]
-    df = pd.DataFrame(rows)
-    pa_table = pa.Table.from_pandas(df, schema=schema)
+    pa_table = pa.Table.from_pylist(rows, schema=schema)
     data = [
         ("Rows", rows),
-        ("pd_DataFrame", df),
         ("pa_Table", pa_table),
     ]
+    if _PANDAS_AVAILABLE:
+        import pandas as pd
+
+        df = pd.DataFrame(rows)
+        data.append(("pd_DataFrame", df))
 
     for name, d in data:
         tbl = mem_db.create_table(name, data=d, schema=schema).to_arrow()
@@ -296,7 +299,7 @@ def test_add_subschema(mem_db: DBConnection):
     data = {"price": 10.0, "item": "foo"}
     table.add([data])
 
-    data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
+    data = pa.Table.from_pydict({"price": [2.0], "vector": [[3.1, 4.1]]})
     table.add(data)
 
     data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
     table.add([data])
@@ -405,6 +408,7 @@ def test_add_nullability(mem_db: DBConnection):
 
 
 def test_add_pydantic_model(mem_db: DBConnection):
+    pytest.importorskip("pandas")
     # https://github.com/lancedb/lancedb/issues/562
     class Metadata(BaseModel):
@@ -473,10 +477,10 @@ def test_polars(mem_db: DBConnection):
     table = mem_db.create_table("test", data=pl.DataFrame(data))
     assert len(table) == 2
 
-    result = table.to_pandas()
-    assert np.allclose(result["vector"].tolist(), data["vector"])
-    assert result["item"].tolist() == data["item"]
-    assert np.allclose(result["price"].tolist(), data["price"])
+    result = table.to_arrow()
+    assert np.allclose(result["vector"].to_pylist(), data["vector"])
+    assert result["item"].to_pylist() == data["item"]
+    assert np.allclose(result["price"].to_pylist(), data["price"])
 
     schema = pa.schema(
         [
@@ -688,7 +692,7 @@ def test_delete(mem_db: DBConnection):
     assert len(table.list_versions()) == 2
     assert table.version == 2
     assert len(table) == 1
-    assert table.to_pandas()["id"].tolist() == [1]
+    assert table.to_arrow()["id"].to_pylist() == [1]
 
 
 def test_update(mem_db: DBConnection):
@@ -852,6 +856,7 @@ def test_merge_insert(mem_db: DBConnection):
     ids=["pa.Table", "pd.DataFrame", "rows"],
 )
 def test_merge_insert_subschema(mem_db: DBConnection, data_format):
+    pytest.importorskip("pandas")
     initial_data = pa.table(
         {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
     )
@@ -948,7 +953,7 @@ def test_create_with_embedding_function(mem_db: DBConnection):
     func = MockTextEmbeddingFunction.create()
     texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)})
+    df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)})
 
     conf = EmbeddingFunctionConfig(
         source_column="text", vector_column="vector", function=func
@@ -973,7 +978,7 @@ def test_create_f16_table(mem_db: DBConnection):
         text: str
         vector: Vector(32, value_type=pa.float16())
 
-    df = pd.DataFrame(
+    df = pa.table(
         {
             "text": [f"s-{i}" for i in range(512)],
             "vector": [np.random.randn(32).astype(np.float16) for _ in range(512)],
@@ -986,7 +991,7 @@ def test_create_f16_table(mem_db: DBConnection):
     table.add(df)
     table.create_index(num_partitions=2, num_sub_vectors=2)
 
-    query = df.vector.iloc[2]
+    query = df["vector"][2].as_py()
     expected = table.search(query).limit(2).to_arrow()
     assert "s-2" in expected["text"].to_pylist()
@@ -1002,7 +1007,7 @@ def test_add_with_embedding_function(mem_db: DBConnection):
     table = mem_db.create_table("my_table", schema=MyTable)
 
     texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts})
+    df = pa.table({"text": texts})
     table.add(df)
 
     texts = ["the quick brown fox", "jumped over the lazy dog"]
@@ -1033,14 +1038,14 @@ def test_multiple_vector_columns(mem_db: DBConnection):
         {"vector1": v1, "vector2": v2, "text": "foo"},
         {"vector1": v2, "vector2": v1, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
-    result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow()
+    result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow()
 
-    assert result1["text"].iloc[0] != result2["text"].iloc[0]
+    assert result1["text"][0] != result2["text"][0]
 
 
 def test_create_scalar_index(mem_db: DBConnection):
@@ -1078,22 +1083,22 @@ def test_empty_query(mem_db: DBConnection):
         "my_table",
         data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
     )
-    df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
-    val = df.id.iloc[0]
+    df = table.search().select(["id"]).where("text='bar'").limit(1).to_arrow()
+    val = df["id"][0].as_py()
     assert val == 1
 
     table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
-    df = table.search().select(["id"]).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).to_arrow()
+    assert df.num_rows == 100
     # None is the same as default
-    df = table.search().select(["id"]).limit(None).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).limit(None).to_arrow()
+    assert df.num_rows == 100
     # invalid limit is the same as None, which is the same as default
-    df = table.search().select(["id"]).limit(-1).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).limit(-1).to_arrow()
+    assert df.num_rows == 100
     # valid limit should work
-    df = table.search().select(["id"]).limit(42).to_pandas()
-    assert len(df) == 42
+    df = table.search().select(["id"]).limit(42).to_arrow()
+    assert df.num_rows == 42
 
 
 def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
@@ -1112,14 +1117,14 @@ def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
         {"vector_col": v1, "text": "foo"},
         {"vector_col": v2, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_pandas()
-    result2 = table.search(q).limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_arrow()
+    result2 = table.search(q).limit(1).to_arrow()
 
-    assert result1["text"].iloc[0] == result2["text"].iloc[0]
+    assert result1["text"][0].as_py() == result2["text"][0].as_py()
 
 
 def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
@@ -1139,12 +1144,12 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
         {"vector1": v1, "vector2": v2, "text": "foo"},
         {"vector1": v2, "vector2": v1, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
 
     with pytest.raises(ValueError):
-        table.search(q).limit(1).to_pandas()
+        table.search(q).limit(1).to_arrow()
 
 
 def test_compact_cleanup(tmp_db: DBConnection):
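For quick reference, the pandas-to-PyArrow substitutions applied throughout the test changes above, using only standard PyArrow APIs (the variables here are illustrative):

```python
import pyarrow as pa

rows = [{"id": 1}, {"id": 2}]
cols = {"id": [1, 2]}

tbl = pa.Table.from_pylist(rows)  # replaces pd.DataFrame(rows)
tbl = pa.table(cols)              # replaces pd.DataFrame({...});
                                  # pa.Table.from_pydict is equivalent

tbl.num_rows                      # replaces len(df)
tbl["id"].to_pylist()             # replaces df["id"].tolist()
tbl["id"][0].as_py()              # replaces df.id.iloc[0]
```

One behavioral nuance visible in the diff: indexing a PyArrow column yields a `pa.Scalar`, hence the `.as_py()` calls wherever the tests compare against plain Python values.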