From b2a38ac3663cf1358601f1e1380e3e2eae13d8a3 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 21 Mar 2025 11:26:32 -0700 Subject: [PATCH] fix: make pylance optional again (#2209) The two remaining blockers were: * A method `with_embeddings` that was deprecated a year ago * A typecheck for `LanceDataset` --- .github/workflows/python.yml | 9 ++++ docs/src/python/python.md | 2 - python/pyproject.toml | 4 +- python/python/lancedb/common.py | 5 +- python/python/lancedb/context.py | 4 +- python/python/lancedb/dependencies.py | 13 +++++ python/python/lancedb/embeddings/__init__.py | 1 - python/python/lancedb/embeddings/utils.py | 51 +------------------ python/python/lancedb/integrations/pyarrow.py | 1 + python/python/lancedb/query.py | 5 +- python/python/lancedb/table.py | 21 ++++---- python/python/lancedb/util.py | 18 ------- python/python/tests/test_embeddings.py | 18 ------- python/python/tests/test_table.py | 8 ++- 14 files changed, 49 insertions(+), 111 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c30bac92..9fca789c 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -13,6 +13,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + # Color output for pytest is off by default. + PYTEST_ADDOPTS: "--color=yes" + FORCE_COLOR: "1" + jobs: lint: name: "Lint" @@ -131,6 +136,10 @@ jobs: - uses: ./.github/workflows/run_tests with: integration: true + - name: Test without pylance + run: | + pip uninstall -y pylance + pytest -vv python/tests/test_table.py # Make sure wheels are not included in the Rust cache - name: Delete wheels run: rm -rf target/wheels diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 00dcd84b..8e2c456b 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -59,8 +59,6 @@ is also an [asynchronous API client](#connections-asynchronous). ::: lancedb.embeddings.open_clip.OpenClipEmbeddings -::: lancedb.embeddings.utils.with_embeddings - ## Context ::: lancedb.context.contextualize diff --git a/python/pyproject.toml b/python/pyproject.toml index 64b73c33..b1037a85 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -9,7 +9,6 @@ dependencies = [ "pydantic>=1.10", "packaging", "overrides>=0.7", - "pylance>=0.23.2", ] description = "lancedb" authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }] @@ -55,6 +54,7 @@ tests = [ "polars>=0.19, <=1.3.0", "tantivy", "pyarrow-stubs", + "pylance>=0.23.2", ] dev = [ "ruff", @@ -63,7 +63,7 @@ dev = [ 'typing-extensions>=4.0.0; python_version < "3.11"', ] docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] -clip = ["torch", "pillow", "open-clip"] +clip = ["torch", "pillow", "open-clip-torch"] embeddings = [ "requests>=2.31.0", "openai>=1.6.1", diff --git a/python/python/lancedb/common.py b/python/python/lancedb/common.py index 07ae8efe..253c51a3 100644 --- a/python/python/lancedb/common.py +++ b/python/python/lancedb/common.py @@ -7,10 +7,9 @@ from typing import Iterable, List, Optional, Union import numpy as np import pyarrow as pa +import pyarrow.dataset -from .util import safe_import_pandas - -pd = safe_import_pandas() +from .dependencies import pandas as pd DATA = Union[List[dict], "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]] VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray] diff --git a/python/python/lancedb/context.py b/python/python/lancedb/context.py index b0e085ff..03c87db5 100644 --- a/python/python/lancedb/context.py +++ b/python/python/lancedb/context.py @@ -8,9 +8,7 @@ import deprecation from . import __version__ from .exceptions import MissingColumnError, MissingValueError -from .util import safe_import_pandas - -pd = safe_import_pandas() +from .dependencies import pandas as pd def contextualize(raw_df: "pd.DataFrame") -> Contextualizer: diff --git a/python/python/lancedb/dependencies.py b/python/python/lancedb/dependencies.py index 50fc5534..0e420c89 100644 --- a/python/python/lancedb/dependencies.py +++ b/python/python/lancedb/dependencies.py @@ -30,6 +30,7 @@ _TORCH_AVAILABLE = True _HUGGING_FACE_AVAILABLE = True _TENSORFLOW_AVAILABLE = True _RAY_AVAILABLE = True +_LANCE_AVAILABLE = True class _LazyModule(ModuleType): @@ -53,6 +54,7 @@ class _LazyModule(ModuleType): "torch": "torch.", "tensorflow": "tf.", "ray": "ray.", + "lance": "lance.", } def __init__( @@ -169,6 +171,7 @@ if TYPE_CHECKING: import ray import tensorflow import torch + import lance else: # heavy/optional third party libs numpy, _NUMPY_AVAILABLE = _lazy_import("numpy") @@ -178,6 +181,7 @@ else: datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets") tensorflow, _TENSORFLOW_AVAILABLE = _lazy_import("tensorflow") ray, _RAY_AVAILABLE = _lazy_import("ray") + lance, _LANCE_AVAILABLE = _lazy_import("lance") @lru_cache(maxsize=None) @@ -232,6 +236,12 @@ def _check_for_ray(obj: Any, *, check_type: bool = True) -> bool: ) +def _check_for_lance(obj: Any, *, check_type: bool = True) -> bool: + return _LANCE_AVAILABLE and _might_be( + cast(Hashable, type(obj) if check_type else obj), "lance" + ) + + __all__ = [ # lazy-load third party libs "datasets", @@ -241,6 +251,7 @@ __all__ = [ "ray", "tensorflow", "torch", + "lance", # lazy utilities "_check_for_hugging_face", "_check_for_numpy", @@ -249,6 +260,7 @@ __all__ = [ "_check_for_tensorflow", "_check_for_torch", "_check_for_ray", + "_check_for_lance", "_LazyModule", # exported flags/guards "_NUMPY_AVAILABLE", @@ -258,4 +270,5 @@ __all__ = [ "_HUGGING_FACE_AVAILABLE", "_TENSORFLOW_AVAILABLE", "_RAY_AVAILABLE", + "_LANCE_AVAILABLE", ] diff --git a/python/python/lancedb/embeddings/__init__.py b/python/python/lancedb/embeddings/__init__.py index 0b9680d4..c4854fd1 100644 --- a/python/python/lancedb/embeddings/__init__.py +++ b/python/python/lancedb/embeddings/__init__.py @@ -16,7 +16,6 @@ from .sentence_transformers import SentenceTransformerEmbeddings from .gte import GteEmbeddings from .transformers import TransformersEmbeddingFunction, ColbertEmbeddings from .imagebind import ImageBindEmbeddings -from .utils import with_embeddings from .jinaai import JinaEmbeddings from .watsonx import WatsonxEmbeddings from .voyageai import VoyageAIEmbeddingFunction diff --git a/python/python/lancedb/embeddings/utils.py b/python/python/lancedb/embeddings/utils.py index cf7cf104..6a4c577c 100644 --- a/python/python/lancedb/embeddings/utils.py +++ b/python/python/lancedb/embeddings/utils.py @@ -16,9 +16,8 @@ from functools import wraps from typing import Callable, List, Union import numpy as np import pyarrow as pa -from lance.vector import vec_to_table -from ..util import deprecated, safe_import_pandas +from ..dependencies import pandas as pd # ruff: noqa: PERF203 @@ -41,8 +40,6 @@ def retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1): return wrapper -pd = safe_import_pandas() - DATA = Union[pa.Table, "pd.DataFrame"] TEXT = Union[str, List[str], pa.Array, pa.ChunkedArray, np.ndarray] IMAGES = Union[ @@ -87,52 +84,6 @@ class RateLimiter: return wrapper -@deprecated -def with_embeddings( - func: Callable, - data: DATA, - column: str = "text", - wrap_api: bool = True, - show_progress: bool = False, - batch_size: int = 1000, -) -> pa.Table: - """Add a vector column to a table using the given embedding function. - - The new columns will be called "vector". - - Parameters - ---------- - func : Callable - A function that takes a list of strings and returns a list of vectors. - data : pa.Table or pd.DataFrame - The data to add an embedding column to. - column : str, default "text" - The name of the column to use as input to the embedding function. - wrap_api : bool, default True - Whether to wrap the embedding function in a retry and rate limiter. - show_progress : bool, default False - Whether to show a progress bar. - batch_size : int, default 1000 - The number of row values to pass to each call of the embedding function. - - Returns - ------- - pa.Table - The input table with a new column called "vector" containing the embeddings. - """ - func = FunctionWrapper(func) - if wrap_api: - func = func.retry().rate_limit() - func = func.batch_size(batch_size) - if show_progress: - func = func.show_progress() - if pd is not None and isinstance(data, pd.DataFrame): - data = pa.Table.from_pandas(data, preserve_index=False) - embeddings = func(data[column].to_numpy()) - table = vec_to_table(np.array(embeddings)) - return data.append_column("vector", table["vector"]) - - class FunctionWrapper: """ A wrapper for embedding functions that adds rate limiting, retries, and batching. diff --git a/python/python/lancedb/integrations/pyarrow.py b/python/python/lancedb/integrations/pyarrow.py index db4353e3..08596469 100644 --- a/python/python/lancedb/integrations/pyarrow.py +++ b/python/python/lancedb/integrations/pyarrow.py @@ -5,6 +5,7 @@ import logging from typing import Any, List, Optional, Tuple, Union, Literal import pyarrow as pa +import pyarrow.dataset from ..table import Table diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 348da579..fdbc8751 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -26,10 +26,11 @@ import pydantic from . import __version__ from .arrow import AsyncRecordBatchReader +from .dependencies import pandas as pd from .rerankers.base import Reranker from .rerankers.rrf import RRFReranker from .rerankers.util import check_reranker_result -from .util import safe_import_pandas, flatten_columns +from .util import flatten_columns if TYPE_CHECKING: import sys @@ -49,8 +50,6 @@ if TYPE_CHECKING: else: from typing_extensions import Self -pd = safe_import_pandas() - class Query(pydantic.BaseModel): """The LanceDB Query diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index aebc2c82..ac7633ab 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -28,12 +28,19 @@ from urllib.parse import urlparse from . import __version__ from lancedb.arrow import peek_reader from lancedb.background_loop import LOOP -from .dependencies import _check_for_hugging_face, _check_for_pandas +from .dependencies import ( + _check_for_hugging_face, + _check_for_lance, + _check_for_pandas, + lance, + pandas as pd, + polars as pl, +) import pyarrow as pa +import pyarrow.dataset import pyarrow.compute as pc import pyarrow.fs as pa_fs import numpy as np -from lance import LanceDataset from .common import DATA, VEC, VECTOR_COLUMN_NAME from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry @@ -58,8 +65,6 @@ from .util import ( get_uri_scheme, infer_vector_column_name, join_uri, - safe_import_pandas, - safe_import_polars, value_to_sql, ) from .index import lang_mapping @@ -88,10 +93,6 @@ if TYPE_CHECKING: ) -pd = safe_import_pandas() -pl = safe_import_polars() - - def _into_pyarrow_reader(data) -> pa.RecordBatchReader: from lancedb.dependencies import datasets @@ -130,7 +131,7 @@ def _into_pyarrow_reader(data) -> pa.RecordBatchReader: return data.to_reader() elif isinstance(data, pa.RecordBatch): return pa.RecordBatchReader.from_batches(data.schema, [data]) - elif isinstance(data, LanceDataset): + elif _check_for_lance(data) and isinstance(data, lance.LanceDataset): return data.scanner().to_reader() elif isinstance(data, pa.dataset.Dataset): return data.scanner().to_reader() @@ -1440,7 +1441,7 @@ class LanceTable(Table): # Cacheable since it's deterministic return _table_path(self._conn.uri, self.name) - def to_lance(self, **kwargs) -> LanceDataset: + def to_lance(self, **kwargs) -> lance.LanceDataset: """Return the LanceDataset backing this table.""" try: import lance diff --git a/python/python/lancedb/util.py b/python/python/lancedb/util.py index 3eb70d20..2139252e 100644 --- a/python/python/lancedb/util.py +++ b/python/python/lancedb/util.py @@ -157,24 +157,6 @@ def attempt_import_or_raise(module: str, mitigation=None): raise ImportError(f"Please install {mitigation or module}") -def safe_import_pandas(): - try: - import pandas as pd - - return pd - except ImportError: - return None - - -def safe_import_polars(): - try: - import polars as pl - - return pl - except ImportError: - return None - - def flatten_columns(tbl: pa.Table, flatten: Optional[Union[int, bool]] = None): """ Flatten all struct columns in a table. diff --git a/python/python/tests/test_embeddings.py b/python/python/tests/test_embeddings.py index c3cc32b5..3284f11f 100644 --- a/python/python/tests/test_embeddings.py +++ b/python/python/tests/test_embeddings.py @@ -15,7 +15,6 @@ from lancedb.conftest import MockTextEmbeddingFunction from lancedb.embeddings import ( EmbeddingFunctionConfig, EmbeddingFunctionRegistry, - with_embeddings, ) from lancedb.embeddings.base import TextEmbeddingFunction from lancedb.embeddings.registry import get_registry, register @@ -27,23 +26,6 @@ def mock_embed_func(input_data): return [np.random.randn(128).tolist() for _ in range(len(input_data))] -def test_with_embeddings(): - for wrap_api in [True, False]: - data = pa.Table.from_arrays( - [ - pa.array(["foo", "bar"]), - pa.array([10.0, 20.0]), - ], - names=["text", "price"], - ) - data = with_embeddings(mock_embed_func, data, wrap_api=wrap_api) - assert data.num_columns == 3 - assert data.num_rows == 2 - assert data.column_names == ["text", "price", "vector"] - assert data.column("text").to_pylist() == ["foo", "bar"] - assert data.column("price").to_pylist() == [10.0, 20.0] - - def test_embedding_function(tmp_path): registry = EmbeddingFunctionRegistry.get_instance() diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index ad5c6fe3..e816dc71 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -8,13 +8,13 @@ from time import sleep from typing import List from unittest.mock import patch -import lance import lancedb from lancedb.index import HnswPq, HnswSq, IvfPq import numpy as np import pandas as pd import polars as pl import pyarrow as pa +import pyarrow.dataset import pytest from lancedb.conftest import MockTextEmbeddingFunction from lancedb.db import AsyncConnection, DBConnection @@ -650,6 +650,9 @@ def test_restore(mem_db: DBConnection): def test_merge(tmp_db: DBConnection, tmp_path): + pytest.importorskip("lance") + import lance + table = tmp_db.create_table( "my_table", schema=pa.schema( @@ -1145,6 +1148,7 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection): def test_compact_cleanup(tmp_db: DBConnection): + pytest.importorskip("lance") table = tmp_db.create_table( "my_table", data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}], @@ -1222,6 +1226,7 @@ def setup_hybrid_search_table(db: DBConnection, embedding_func): def test_hybrid_search(tmp_db: DBConnection): # This test uses an FTS index pytest.importorskip("lancedb.fts") + pytest.importorskip("lance") table, MyTable, emb = setup_hybrid_search_table(tmp_db, "test") @@ -1292,6 +1297,7 @@ def test_hybrid_search(tmp_db: DBConnection): def test_hybrid_search_metric_type(tmp_db: DBConnection): # This test uses an FTS index pytest.importorskip("lancedb.fts") + pytest.importorskip("lance") # Need to use nonnorm as the embedding function so l2 and dot results # are different