From 01e4291d2123a5efaedd01bc8cd258dfc93c56c9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 26 Feb 2025 15:53:45 -0800 Subject: [PATCH] feat(python): drop hard dependency on pylance (#2156) Closes #1793 --- python/pyproject.toml | 3 +- python/python/lancedb/_lancedb.pyi | 4 +++ python/python/lancedb/table.py | 52 +++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index c5207291..e90e27c1 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,8 +4,8 @@ name = "lancedb" dynamic = ["version"] dependencies = [ "deprecation", - "pylance~=0.23.2", "tqdm>=4.27.0", + "pyarrow>=14", "pydantic>=1.10", "packaging", "overrides>=0.7", @@ -54,6 +54,7 @@ tests = [ "polars>=0.19, <=1.3.0", "tantivy", "pyarrow-stubs", + "pylance~=0.23.2", ] dev = [ "ruff", diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 993ac03a..8ac3ec07 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -142,6 +142,10 @@ class CompactionStats: files_removed: int files_added: int +class CleanupStats: + bytes_removed: int + old_versions: int + class RemovalStats: bytes_removed: int old_versions_removed: int diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index ce084b1d..cf077ba5 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -5,6 +5,7 @@ from __future__ import annotations import asyncio import inspect +import deprecation import warnings from abc import ABC, abstractmethod from dataclasses import dataclass @@ -24,16 +25,15 @@ from typing import ( ) from urllib.parse import urlparse -import lance +from . import __version__ from lancedb.arrow import peek_reader from lancedb.background_loop import LOOP -from .dependencies import _check_for_pandas +from .dependencies import _check_for_hugging_face, _check_for_pandas import pyarrow as pa import pyarrow.compute as pc import pyarrow.fs as pa_fs import numpy as np from lance import LanceDataset -from lance.dependencies import _check_for_hugging_face from .common import DATA, VEC, VECTOR_COLUMN_NAME from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry @@ -66,10 +66,14 @@ from .index import lang_mapping if TYPE_CHECKING: - from ._lancedb import Table as LanceDBTable, OptimizeStats, CompactionStats + from ._lancedb import ( + Table as LanceDBTable, + OptimizeStats, + CleanupStats, + CompactionStats, + ) from .db import LanceDBConnection from .index import IndexConfig - from lance.dataset import CleanupStats, ReaderLike import pandas import PIL @@ -80,10 +84,9 @@ QueryType = Literal["vector", "fts", "hybrid", "auto"] def _into_pyarrow_reader(data) -> pa.RecordBatchReader: - if _check_for_hugging_face(data): - # Huggingface datasets - from lance.dependencies import datasets + from lancedb.dependencies import datasets + if _check_for_hugging_face(data): if isinstance(data, datasets.Dataset): schema = data.features.arrow_schema return pa.RecordBatchReader.from_batches(schema, data.data.to_batches()) @@ -1074,7 +1077,7 @@ class Table(ABC): older_than: Optional[timedelta] = None, *, delete_unverified: bool = False, - ) -> CleanupStats: + ) -> "CleanupStats": """ Clean up old versions of the table, freeing disk space. @@ -1385,6 +1388,14 @@ class LanceTable(Table): def to_lance(self, **kwargs) -> LanceDataset: """Return the LanceDataset backing this table.""" + try: + import lance + except ImportError: + raise ImportError( + "The lance library is required to use this function. " + "Please install with `pip install pylance`." + ) + return lance.dataset( self._dataset_path, version=self.version, @@ -1844,7 +1855,7 @@ class LanceTable(Table): def merge( self, - other_table: Union[LanceTable, ReaderLike], + other_table: Union[LanceTable, DATA], left_on: str, right_on: Optional[str] = None, schema: Optional[Union[pa.Schema, LanceModel]] = None, @@ -1894,12 +1905,13 @@ class LanceTable(Table): 1 2 b e 2 3 c f """ - if isinstance(schema, LanceModel): - schema = schema.to_arrow_schema() if isinstance(other_table, LanceTable): other_table = other_table.to_lance() - if isinstance(other_table, LanceDataset): - other_table = other_table.to_table() + else: + other_table = _sanitize_data( + other_table, + schema, + ) self.to_lance().merge( other_table, left_on=left_on, right_on=right_on, schema=schema ) @@ -2222,12 +2234,17 @@ class LanceTable(Table): ): LOOP.run(self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)) + @deprecation.deprecated( + deprecated_in="0.21.0", + current_version=__version__, + details="Use `Table.optimize` instead.", + ) def cleanup_old_versions( self, older_than: Optional[timedelta] = None, *, delete_unverified: bool = False, - ) -> CleanupStats: + ) -> "CleanupStats": """ Clean up old versions of the table, freeing disk space. @@ -2252,6 +2269,11 @@ class LanceTable(Table): older_than, delete_unverified=delete_unverified ) + @deprecation.deprecated( + deprecated_in="0.21.0", + current_version=__version__, + details="Use `Table.optimize` instead.", + ) def compact_files(self, *args, **kwargs) -> CompactionStats: """ Run the compaction process on the table.