From f124c9d8d2a73c684a46fe07b478c1d397c50fa5 Mon Sep 17 00:00:00 2001
From: Jack Ye <yezhaoqin@gmail.com>
Date: Wed, 21 Jan 2026 13:40:48 -0800
Subject: [PATCH] test: string type conversion in pandas 3.0+ (#2928)

In pandas 3.0+, string columns now convert to Arrow large_utf8. This PR
makes sure our tests account for that difference across pandas versions
when constructing the expected schemas.
---
 python/python/tests/conftest.py  | 15 +++++++++++++++
 python/python/tests/test_db.py   | 17 +++++++++++++----
 python/python/tests/test_util.py | 13 ++++++++++---
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/python/python/tests/conftest.py b/python/python/tests/conftest.py
index e3205216e..8567a9830 100644
--- a/python/python/tests/conftest.py
+++ b/python/python/tests/conftest.py
@@ -2,12 +2,27 @@
 # SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 from datetime import timedelta
+
 from lancedb.db import AsyncConnection, DBConnection
 import lancedb
 import pytest
 import pytest_asyncio
 
 
+def pandas_string_type():
+    """Return the PyArrow string type that pandas uses for string columns.
+
+    pandas 3.0+ maps string columns to large_string; older versions use string.
+    """
+    import pandas as pd
+    import pyarrow as pa
+
+    version = tuple(int(x) for x in pd.__version__.split(".")[:2])
+    if version >= (3, 0):
+        return pa.large_utf8()
+    return pa.utf8()
+
+
 # Use an in-memory database for most tests.
 @pytest.fixture
 def mem_db() -> DBConnection:
diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py
index e7c2beb9f..df922342e 100644
--- a/python/python/tests/test_db.py
+++ b/python/python/tests/test_db.py
@@ -268,6 +268,8 @@ async def test_create_table_from_iterator_async(mem_db_async: lancedb.AsyncConne
 
 
 def test_create_exist_ok(tmp_db: lancedb.DBConnection):
+    from conftest import pandas_string_type
+
     data = pd.DataFrame(
         {
             "vector": [[3.1, 4.1], [5.9, 26.5]],
@@ -286,10 +288,11 @@ def test_create_exist_ok(tmp_db: lancedb.DBConnection):
     assert tbl.schema == tbl2.schema
     assert len(tbl) == len(tbl2)
 
+    # pandas 3.0+ uses large_string, pandas 2.x uses string
     schema = pa.schema(
         [
             pa.field("vector", pa.list_(pa.float32(), list_size=2)),
-            pa.field("item", pa.utf8()),
+            pa.field("item", pandas_string_type()),
             pa.field("price", pa.float64()),
         ]
     )
@@ -299,7 +302,7 @@ def test_create_exist_ok(tmp_db: lancedb.DBConnection):
     bad_schema = pa.schema(
         [
             pa.field("vector", pa.list_(pa.float32(), list_size=2)),
-            pa.field("item", pa.utf8()),
+            pa.field("item", pandas_string_type()),
             pa.field("price", pa.float64()),
             pa.field("extra", pa.float32()),
         ]
@@ -365,6 +368,8 @@ async def test_create_mode_async(tmp_db_async: lancedb.AsyncConnection):
 
 @pytest.mark.asyncio
 async def test_create_exist_ok_async(tmp_db_async: lancedb.AsyncConnection):
+    from conftest import pandas_string_type
+
     data = pd.DataFrame(
         {
             "vector": [[3.1, 4.1], [5.9, 26.5]],
@@ -382,10 +387,11 @@ async def test_create_exist_ok_async(tmp_db_async: lancedb.AsyncConnection):
     assert tbl.name == tbl2.name
     assert await tbl.schema() == await tbl2.schema()
 
+    # pandas 3.0+ uses large_string, pandas 2.x uses string
     schema = pa.schema(
         [
             pa.field("vector", pa.list_(pa.float32(), list_size=2)),
-            pa.field("item", pa.utf8()),
+            pa.field("item", pandas_string_type()),
             pa.field("price", pa.float64()),
         ]
     )
@@ -595,6 +601,8 @@ def test_open_table_sync(tmp_db: lancedb.DBConnection):
 
 @pytest.mark.asyncio
 async def test_open_table(tmp_path):
+    from conftest import pandas_string_type
+
     db = await lancedb.connect_async(tmp_path)
     data = pd.DataFrame(
         {
@@ -614,10 +622,11 @@ async def test_open_table(tmp_path):
         )
         is not None
     )
+    # pandas 3.0+ uses large_string, pandas 2.x uses string
     assert await tbl.schema() == pa.schema(
         {
             "vector": pa.list_(pa.float32(), list_size=2),
-            "item": pa.utf8(),
+            "item": pandas_string_type(),
             "price": pa.float64(),
         }
     )
diff --git a/python/python/tests/test_util.py b/python/python/tests/test_util.py
index 4ff120f47..a847deaca 100644
--- a/python/python/tests/test_util.py
+++ b/python/python/tests/test_util.py
@@ -528,12 +528,19 @@ def test_sanitize_data(
         else:
             expected_schema = schema
     else:
+        from conftest import pandas_string_type
+
+        # polars and pandas 3.0+ yield large_string; all other inputs yield string
+        if isinstance(data, pl.DataFrame):
+            text_type = pa.large_utf8()
+        elif isinstance(data, pd.DataFrame):
+            text_type = pandas_string_type()
+        else:
+            text_type = pa.string()
         expected_schema = pa.schema(
             {
                 "id": pa.int64(),
-                "text": pa.large_utf8()
-                if isinstance(data, pl.DataFrame)
-                else pa.string(),
+                "text": text_type,
                 "vector": pa.list_(pa.float32(), 10),
             }
         )