From 92f0b16e46f343a66d539c20e88df64d586037bb Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Mon, 21 Apr 2025 13:42:13 -0700
Subject: [PATCH] fix(python): make sure pandas is optional (#2346)

Fixes #2344

## Summary by CodeRabbit

- **Tests**
  - Updated tests to use PyArrow Tables instead of pandas DataFrames where possible, reducing reliance on pandas (see the sketch after this list).
  - Tests that require pandas are now automatically skipped if pandas is not installed.
- **Chores**
  - Improved workflow to uninstall both pylance and pandas in a specific test step.
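Both test-side mechanisms are condensed below. This is a minimal sketch, not a test from the suite: the function names are illustrative, while `pytest.importorskip`, `pa.Table.from_pylist`, and `_PANDAS_AVAILABLE` (from `lancedb.dependencies`) are the pieces the real changes in `python/python/tests/test_table.py` use.

```python
import pyarrow as pa
import pytest

from lancedb.dependencies import _PANDAS_AVAILABLE


def test_needs_pandas():
    # importorskip skips this test when pandas is missing, instead of
    # failing at collection time with a ModuleNotFoundError.
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame({"id": [1, 2]})
    assert len(df) == 2


def test_runs_without_pandas():
    # Build the fixture data with PyArrow directly; no pandas needed.
    tbl = pa.Table.from_pylist([{"id": 1}, {"id": 2}])
    assert tbl.num_rows == 2

    # Exercise the pandas code path only when pandas is available.
    if _PANDAS_AVAILABLE:
        import pandas as pd

        df = pd.DataFrame({"id": [1, 2]})
        assert len(df) == 2
```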
"item": "bar"} table.add([data]) @@ -405,6 +408,7 @@ def test_add_nullability(mem_db: DBConnection): def test_add_pydantic_model(mem_db: DBConnection): + pytest.importorskip("pandas") # https://github.com/lancedb/lancedb/issues/562 class Metadata(BaseModel): @@ -473,10 +477,10 @@ def test_polars(mem_db: DBConnection): table = mem_db.create_table("test", data=pl.DataFrame(data)) assert len(table) == 2 - result = table.to_pandas() - assert np.allclose(result["vector"].tolist(), data["vector"]) - assert result["item"].tolist() == data["item"] - assert np.allclose(result["price"].tolist(), data["price"]) + result = table.to_arrow() + assert np.allclose(result["vector"].to_pylist(), data["vector"]) + assert result["item"].to_pylist() == data["item"] + assert np.allclose(result["price"].to_pylist(), data["price"]) schema = pa.schema( [ @@ -688,7 +692,7 @@ def test_delete(mem_db: DBConnection): assert len(table.list_versions()) == 2 assert table.version == 2 assert len(table) == 1 - assert table.to_pandas()["id"].tolist() == [1] + assert table.to_arrow()["id"].to_pylist() == [1] def test_update(mem_db: DBConnection): @@ -852,6 +856,7 @@ def test_merge_insert(mem_db: DBConnection): ids=["pa.Table", "pd.DataFrame", "rows"], ) def test_merge_insert_subschema(mem_db: DBConnection, data_format): + pytest.importorskip("pandas") initial_data = pa.table( {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]} ) @@ -948,7 +953,7 @@ def test_create_with_embedding_function(mem_db: DBConnection): func = MockTextEmbeddingFunction.create() texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"] - df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)}) + df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)}) conf = EmbeddingFunctionConfig( source_column="text", vector_column="vector", function=func @@ -973,7 +978,7 @@ def test_create_f16_table(mem_db: DBConnection): text: str vector: Vector(32, value_type=pa.float16()) - df = pd.DataFrame( + df = pa.table( { "text": [f"s-{i}" for i in range(512)], "vector": [np.random.randn(32).astype(np.float16) for _ in range(512)], @@ -986,7 +991,7 @@ def test_create_f16_table(mem_db: DBConnection): table.add(df) table.create_index(num_partitions=2, num_sub_vectors=2) - query = df.vector.iloc[2] + query = df["vector"][2].as_py() expected = table.search(query).limit(2).to_arrow() assert "s-2" in expected["text"].to_pylist() @@ -1002,7 +1007,7 @@ def test_add_with_embedding_function(mem_db: DBConnection): table = mem_db.create_table("my_table", schema=MyTable) texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"] - df = pd.DataFrame({"text": texts}) + df = pa.table({"text": texts}) table.add(df) texts = ["the quick brown fox", "jumped over the lazy dog"] @@ -1033,14 +1038,14 @@ def test_multiple_vector_columns(mem_db: DBConnection): {"vector1": v1, "vector2": v2, "text": "foo"}, {"vector1": v2, "vector2": v1, "text": "bar"}, ] - df = pd.DataFrame(data) + df = pa.Table.from_pylist(data) table.add(df) q = np.random.randn(10) - result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas() - result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas() + result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow() + result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow() - assert result1["text"].iloc[0] != result2["text"].iloc[0] + assert result1["text"][0] != result2["text"][0] def test_create_scalar_index(mem_db: DBConnection): @@ 
diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py
index 2bb18989..6624de19 100644
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -9,9 +9,9 @@ from typing import List
 from unittest.mock import patch
 
 import lancedb
+from lancedb.dependencies import _PANDAS_AVAILABLE
 from lancedb.index import HnswPq, HnswSq, IvfPq
 import numpy as np
-import pandas as pd
 import polars as pl
 import pyarrow as pa
 import pyarrow.dataset
@@ -138,13 +138,16 @@ def test_create_table(mem_db: DBConnection):
         {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
         {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
     ]
-    df = pd.DataFrame(rows)
-    pa_table = pa.Table.from_pandas(df, schema=schema)
+    pa_table = pa.Table.from_pylist(rows, schema=schema)
     data = [
         ("Rows", rows),
-        ("pd_DataFrame", df),
         ("pa_Table", pa_table),
     ]
+    if _PANDAS_AVAILABLE:
+        import pandas as pd
+
+        df = pd.DataFrame(rows)
+        data.append(("pd_DataFrame", df))
 
     for name, d in data:
         tbl = mem_db.create_table(name, data=d, schema=schema).to_arrow()
@@ -296,7 +299,7 @@ def test_add_subschema(mem_db: DBConnection):
     data = {"price": 10.0, "item": "foo"}
     table.add([data])
 
-    data = pd.DataFrame({"price": [2.0], "vector": [[3.1, 4.1]]})
+    data = pa.Table.from_pydict({"price": [2.0], "vector": [[3.1, 4.1]]})
     table.add(data)
 
     data = {"price": 3.0, "vector": [5.9, 26.5], "item": "bar"}
     table.add([data])
@@ -405,6 +408,7 @@ def test_add_nullability(mem_db: DBConnection):
 
 
 def test_add_pydantic_model(mem_db: DBConnection):
+    pytest.importorskip("pandas")
     # https://github.com/lancedb/lancedb/issues/562
     class Metadata(BaseModel):
@@ -473,10 +477,10 @@ def test_polars(mem_db: DBConnection):
     table = mem_db.create_table("test", data=pl.DataFrame(data))
     assert len(table) == 2
 
-    result = table.to_pandas()
-    assert np.allclose(result["vector"].tolist(), data["vector"])
-    assert result["item"].tolist() == data["item"]
-    assert np.allclose(result["price"].tolist(), data["price"])
+    result = table.to_arrow()
+    assert np.allclose(result["vector"].to_pylist(), data["vector"])
+    assert result["item"].to_pylist() == data["item"]
+    assert np.allclose(result["price"].to_pylist(), data["price"])
 
     schema = pa.schema(
         [
@@ -688,7 +692,7 @@ def test_delete(mem_db: DBConnection):
     assert len(table.list_versions()) == 2
     assert table.version == 2
     assert len(table) == 1
-    assert table.to_pandas()["id"].tolist() == [1]
+    assert table.to_arrow()["id"].to_pylist() == [1]
 
 
 def test_update(mem_db: DBConnection):
@@ -852,6 +856,7 @@ def test_merge_insert(mem_db: DBConnection):
     ids=["pa.Table", "pd.DataFrame", "rows"],
 )
 def test_merge_insert_subschema(mem_db: DBConnection, data_format):
+    pytest.importorskip("pandas")
     initial_data = pa.table(
         {"id": range(3), "a": [1.0, 2.0, 3.0], "c": ["x", "x", "x"]}
     )
@@ -948,7 +953,7 @@ def test_create_with_embedding_function(mem_db: DBConnection):
     func = MockTextEmbeddingFunction.create()
     texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts, "vector": func.compute_source_embeddings(texts)})
+    df = pa.table({"text": texts, "vector": func.compute_source_embeddings(texts)})
 
     conf = EmbeddingFunctionConfig(
         source_column="text", vector_column="vector", function=func
@@ -973,7 +978,7 @@ def test_create_f16_table(mem_db: DBConnection):
         text: str
         vector: Vector(32, value_type=pa.float16())
 
-    df = pd.DataFrame(
+    df = pa.table(
         {
             "text": [f"s-{i}" for i in range(512)],
             "vector": [np.random.randn(32).astype(np.float16) for _ in range(512)],
@@ -986,7 +991,7 @@ def test_create_f16_table(mem_db: DBConnection):
     table.add(df)
     table.create_index(num_partitions=2, num_sub_vectors=2)
 
-    query = df.vector.iloc[2]
+    query = df["vector"][2].as_py()
     expected = table.search(query).limit(2).to_arrow()
     assert "s-2" in expected["text"].to_pylist()
@@ -1002,7 +1007,7 @@ def test_add_with_embedding_function(mem_db: DBConnection):
     table = mem_db.create_table("my_table", schema=MyTable)
 
     texts = ["hello world", "goodbye world", "foo bar baz fizz buzz"]
-    df = pd.DataFrame({"text": texts})
+    df = pa.table({"text": texts})
     table.add(df)
 
     texts = ["the quick brown fox", "jumped over the lazy dog"]
@@ -1033,14 +1038,14 @@ def test_multiple_vector_columns(mem_db: DBConnection):
         {"vector1": v1, "vector2": v2, "text": "foo"},
         {"vector1": v2, "vector2": v1, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
-    result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector1").limit(1).to_arrow()
+    result2 = table.search(q, vector_column_name="vector2").limit(1).to_arrow()
 
-    assert result1["text"].iloc[0] != result2["text"].iloc[0]
+    assert result1["text"][0] != result2["text"][0]
 
 
 def test_create_scalar_index(mem_db: DBConnection):
@@ -1078,22 +1083,22 @@ def test_empty_query(mem_db: DBConnection):
         "my_table",
         data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
     )
-    df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
-    val = df.id.iloc[0]
+    df = table.search().select(["id"]).where("text='bar'").limit(1).to_arrow()
+    val = df["id"][0].as_py()
     assert val == 1
 
     table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
-    df = table.search().select(["id"]).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).to_arrow()
+    assert df.num_rows == 100
     # None is the same as default
-    df = table.search().select(["id"]).limit(None).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).limit(None).to_arrow()
+    assert df.num_rows == 100
     # invalid limit is the same as None, which is the same as default
-    df = table.search().select(["id"]).limit(-1).to_pandas()
-    assert len(df) == 100
+    df = table.search().select(["id"]).limit(-1).to_arrow()
+    assert df.num_rows == 100
     # valid limit should work
-    df = table.search().select(["id"]).limit(42).to_pandas()
-    assert len(df) == 42
+    df = table.search().select(["id"]).limit(42).to_arrow()
+    assert df.num_rows == 42
 
 
 def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
@@ -1112,14 +1117,14 @@ def test_search_with_schema_inf_single_vector(mem_db: DBConnection):
         {"vector_col": v1, "text": "foo"},
         {"vector_col": v2, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_pandas()
-    result2 = table.search(q).limit(1).to_pandas()
+    result1 = table.search(q, vector_column_name="vector_col").limit(1).to_arrow()
+    result2 = table.search(q).limit(1).to_arrow()
 
-    assert result1["text"].iloc[0] == result2["text"].iloc[0]
+    assert result1["text"][0].as_py() == result2["text"][0].as_py()
 
 
 def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
@@ -1139,12 +1144,12 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
         {"vector1": v1, "vector2": v2, "text": "foo"},
         {"vector1": v2, "vector2": v1, "text": "bar"},
     ]
-    df = pd.DataFrame(data)
+    df = pa.Table.from_pylist(data)
     table.add(df)
     q = np.random.randn(10)
 
     with pytest.raises(ValueError):
-        table.search(q).limit(1).to_pandas()
+        table.search(q).limit(1).to_arrow()
 
 
 def test_compact_cleanup(tmp_db: DBConnection):
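For quick reference, the pandas-to-PyArrow substitutions applied throughout the test changes above, using only standard PyArrow APIs (the variables here are illustrative):

```python
import pyarrow as pa

rows = [{"id": 1}, {"id": 2}]
cols = {"id": [1, 2]}

tbl = pa.Table.from_pylist(rows)  # replaces pd.DataFrame(rows)
tbl = pa.table(cols)              # replaces pd.DataFrame({...});
                                  # pa.Table.from_pydict is equivalent

tbl.num_rows                      # replaces len(df)
tbl["id"].to_pylist()             # replaces df["id"].tolist()
tbl["id"][0].as_py()              # replaces df.id.iloc[0]
```

One behavioral nuance visible in the diff: indexing a PyArrow column yields a `pa.Scalar`, hence the `.as_py()` calls wherever the tests compare against plain Python values.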