diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index f3f4d6a6e..67656d8a3 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -281,6 +281,9 @@ class HnswPq: m: int = 20 ef_construction: int = 300 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None @dataclass @@ -386,6 +389,9 @@ class HnswSq: m: int = 20 ef_construction: int = 300 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None @dataclass @@ -579,6 +585,9 @@ class IvfFlat: max_iterations: int = 50 sample_rate: int = 256 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None @dataclass @@ -609,6 +618,9 @@ class IvfSq: max_iterations: int = 50 sample_rate: int = 256 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None @dataclass @@ -739,6 +751,9 @@ class IvfPq: max_iterations: int = 50 sample_rate: int = 256 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None @dataclass @@ -792,6 +807,9 @@ class IvfRq: max_iterations: int = 50 sample_rate: int = 256 target_partition_size: Optional[int] = None + # Name of the accelerator (e.g. "cuda") to use for IVF training. When set, + # create_index() dispatches to pylance to build the index on the accelerator. + accelerator: Optional[str] = None __all__ = [ diff --git a/python/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py index c2fdcfae9..3d4155269 100644 --- a/python/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -2,11 +2,24 @@ # SPDX-FileCopyrightText: Copyright The LanceDB Authors from datetime import timedelta +import deprecation import logging from functools import cached_property -from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Literal +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Union, + Literal, + overload, +) import warnings +from lancedb import __version__ + from lancedb._lancedb import ( AddColumnsResult, AddResult, @@ -32,6 +45,7 @@ from lancedb.index import ( LabelList, ) from lancedb.remote.db import LOOP +from lancedb.table import IndexConfigType, KNOWN_METRICS import pyarrow as pa from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME @@ -122,6 +136,11 @@ class RemoteTable(Table): """List all the stats of a specified index""" return LOOP.run(self._table.index_stats(index_uuid)) + @deprecation.deprecated( + deprecated_in="0.25.0", + current_version=__version__, + details="Use create_index() with config=BTree()/Bitmap()/LabelList() instead.", + ) def create_scalar_index( self, column: str, @@ -131,7 +150,12 @@ class RemoteTable(Table): wait_timeout: Optional[timedelta] = None, name: Optional[str] = None, ): - """Creates a scalar index + """Creates a scalar index. + + .. deprecated:: 0.25.0 + Use :meth:`create_index` with a BTree, Bitmap, or LabelList config instead. + Example: ``table.create_index("column", config=BTree())`` + Parameters ---------- column : str @@ -162,6 +186,11 @@ class RemoteTable(Table): ) ) + @deprecation.deprecated( + deprecated_in="0.25.0", + current_version=__version__, + details="Use create_index() with config=FTS() instead.", + ) def create_fts_index( self, column: str, @@ -182,6 +211,12 @@ class RemoteTable(Table): prefix_only: bool = False, name: Optional[str] = None, ): + """Create a full-text search index on a column. + + .. deprecated:: 0.25.0 + Use :meth:`create_index` with an FTS config instead. + Example: ``table.create_index("text_column", config=FTS())`` + """ config = FTS( with_position=with_position, base_tokenizer=base_tokenizer, @@ -205,9 +240,43 @@ class RemoteTable(Table): ) ) + # New unified API overload + @overload def create_index( self, - metric="l2", + column: str, + /, + *, + config: IndexConfigType, + wait_timeout: Optional[timedelta] = ..., + name: Optional[str] = ..., + train: bool = ..., + ) -> None: ... + + # Legacy API overload (deprecated) + @overload + def create_index( + self, + metric: Literal["l2", "cosine", "dot", "hamming"] = ..., + vector_column_name: str = ..., + index_cache_size: Optional[int] = ..., + num_partitions: Optional[int] = ..., + num_sub_vectors: Optional[int] = ..., + replace: Optional[bool] = ..., + accelerator: Optional[str] = ..., + index_type: Literal[ + "VECTOR", "IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ" + ] = ..., + wait_timeout: Optional[timedelta] = ..., + *, + num_bits: int = ..., + name: Optional[str] = ..., + train: bool = ..., + ) -> None: ... + + def create_index( + self, + metric: str = "l2", vector_column_name: str = VECTOR_COLUMN_NAME, index_cache_size: Optional[int] = None, num_partitions: Optional[int] = None, @@ -218,89 +287,113 @@ class RemoteTable(Table): wait_timeout: Optional[timedelta] = None, *, num_bits: int = 8, + config: Optional[IndexConfigType] = None, name: Optional[str] = None, train: bool = True, ): - """Create an index on the table. + """Create an index on a column. - Parameters - ---------- - metric : str - The metric to use for the index. Default is "l2". - vector_column_name : str - The name of the vector column. Default is "vector". + This method supports both the new unified API and the legacy API + for backwards compatibility. The new API takes the column name as the + first positional argument and an index configuration object via + ``config``; the legacy API takes the distance metric as the first + argument plus separate ``vector_column_name`` / ``num_partitions`` / + etc. parameters, and emits a ``DeprecationWarning``. Examples -------- - >>> import lancedb - >>> import uuid - >>> from lancedb.schema import vector - >>> db = lancedb.connect("db://...", api_key="...", # doctest: +SKIP - ... region="...") # doctest: +SKIP - >>> table_name = uuid.uuid4().hex - >>> schema = pa.schema( - ... [ - ... pa.field("id", pa.uint32(), False), - ... pa.field("vector", vector(128), False), - ... pa.field("s", pa.string(), False), - ... ] + New API (recommended): + + >>> table.create_index( # doctest: +SKIP + ... "vector", config=IvfPq(distance_type="l2") ... ) - >>> table = db.create_table( # doctest: +SKIP - ... table_name, # doctest: +SKIP - ... schema=schema, # doctest: +SKIP + >>> table.create_index("category", config=BTree()) # doctest: +SKIP + >>> table.create_index("content", config=FTS()) # doctest: +SKIP + + Legacy API (deprecated): + + >>> table.create_index( # doctest: +SKIP + ... "l2", vector_column_name="vector" ... ) - >>> table.create_index("l2", "vector") # doctest: +SKIP """ + # Detect whether this is a legacy API call + is_legacy = self._is_legacy_create_index_call( + metric, + config, + num_partitions, + num_sub_vectors, + vector_column_name, + accelerator, + index_cache_size, + replace, + ) - if accelerator is not None: - logging.warning( - "GPU accelerator is not yet supported on LanceDB cloud." - "If you have 100M+ vectors to index," - "please contact us at contact@lancedb.com" - ) - if replace is not None: - logging.warning( - "replace is not supported on LanceDB cloud." - "Existing indexes will always be replaced." + if is_legacy: + warnings.warn( + "The create_index() API with metric/num_partitions parameters is " + "deprecated and will be removed in a future version. " + "Please migrate to the new unified API:\n" + " # Old (deprecated):\n" + " table.create_index('l2', vector_column_name='my_vector')\n" + " # New (recommended):\n" + " table.create_index('my_vector', config=IvfPq(distance_type='l2'))", + DeprecationWarning, + stacklevel=2, ) - index_type = index_type.upper() - if index_type == "VECTOR" or index_type == "IVF_PQ": - config = IvfPq( - distance_type=metric, - num_partitions=num_partitions, - num_sub_vectors=num_sub_vectors, - num_bits=num_bits, - ) - elif index_type == "IVF_RQ": - config = IvfRq( - distance_type=metric, - num_partitions=num_partitions, - num_bits=num_bits, - ) - elif index_type == "IVF_SQ": - config = IvfSq(distance_type=metric, num_partitions=num_partitions) - elif index_type == "IVF_HNSW_PQ": - raise ValueError( - "IVF_HNSW_PQ is not supported on LanceDB cloud." - "Please use IVF_HNSW_SQ instead." - ) - elif index_type == "IVF_HNSW_SQ": - config = HnswSq(distance_type=metric, num_partitions=num_partitions) - elif index_type == "IVF_HNSW_FLAT": - config = HnswFlat(distance_type=metric, num_partitions=num_partitions) - elif index_type == "IVF_FLAT": - config = IvfFlat(distance_type=metric, num_partitions=num_partitions) + column = vector_column_name + + if accelerator is not None: + logging.warning( + "GPU accelerator is not yet supported on LanceDB cloud." + "If you have 100M+ vectors to index," + "please contact us at contact@lancedb.com" + ) + if replace is not None: + logging.warning( + "replace is not supported on LanceDB cloud." + "Existing indexes will always be replaced." + ) + + idx_type = index_type.upper() + if idx_type == "VECTOR" or idx_type == "IVF_PQ": + config = IvfPq( + distance_type=metric, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + num_bits=num_bits, + ) + elif idx_type == "IVF_RQ": + config = IvfRq( + distance_type=metric, + num_partitions=num_partitions, + num_bits=num_bits, + ) + elif idx_type == "IVF_SQ": + config = IvfSq(distance_type=metric, num_partitions=num_partitions) + elif idx_type == "IVF_HNSW_PQ": + raise ValueError( + "IVF_HNSW_PQ is not supported on LanceDB cloud." + "Please use IVF_HNSW_SQ instead." + ) + elif idx_type == "IVF_HNSW_SQ": + config = HnswSq(distance_type=metric, num_partitions=num_partitions) + elif idx_type == "IVF_HNSW_FLAT": + config = HnswFlat(distance_type=metric, num_partitions=num_partitions) + elif idx_type == "IVF_FLAT": + config = IvfFlat(distance_type=metric, num_partitions=num_partitions) + else: + raise ValueError( + f"Unknown vector index type: {idx_type}. Valid options are" + " 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ'," + " 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'" + ) else: - raise ValueError( - f"Unknown vector index type: {index_type}. Valid options are" - " 'IVF_FLAT', 'IVF_PQ', 'IVF_RQ', 'IVF_SQ'," - " 'IVF_HNSW_PQ', 'IVF_HNSW_SQ', 'IVF_HNSW_FLAT'" - ) + column = metric LOOP.run( self._table.create_index( - vector_column_name, + column, config=config, wait_timeout=wait_timeout, name=name, @@ -308,6 +401,37 @@ class RemoteTable(Table): ) ) + def _is_legacy_create_index_call( + self, + first_arg: str, + config: Optional[IndexConfigType], + num_partitions: Optional[int], + num_sub_vectors: Optional[int], + vector_column_name: str, + accelerator: Optional[str], + index_cache_size: Optional[int], + replace: Optional[bool], + ) -> bool: + """Detect if this is a legacy create_index call.""" + if config is not None: + return False + if any( + x is not None + for x in ( + num_partitions, + num_sub_vectors, + accelerator, + index_cache_size, + replace, + ) + ): + return True + if vector_column_name != VECTOR_COLUMN_NAME: + return True + if first_arg.lower() in KNOWN_METRICS: + return True + return False + def add( self, data: DATA, diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 3a9ae0801..407709d17 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -174,6 +174,24 @@ if TYPE_CHECKING: DistanceType, ) +# Type alias for index configuration objects +IndexConfigType = Union[ + IvfFlat, + IvfPq, + IvfSq, + IvfRq, + HnswFlat, + HnswPq, + HnswSq, + BTree, + Bitmap, + LabelList, + FTS, +] + +# Known distance metrics for legacy API detection +KNOWN_METRICS = {"l2", "cosine", "dot", "hamming"} + def _into_pyarrow_reader( data, schema: Optional[pa.Schema] = None @@ -807,11 +825,49 @@ class Table(ABC): """ raise NotImplementedError + # New unified API overload + @overload def create_index( self, - metric="l2", - num_partitions=256, - num_sub_vectors=96, + column: str, + /, + *, + config: IndexConfigType, + replace: bool = ..., + wait_timeout: Optional[timedelta] = ..., + name: Optional[str] = ..., + train: bool = ..., + ) -> None: ... + + # Legacy API overload (deprecated) + @overload + def create_index( + self, + metric: Literal["l2", "cosine", "dot", "hamming"] = ..., + num_partitions: Optional[int] = ..., + num_sub_vectors: Optional[int] = ..., + vector_column_name: str = ..., + replace: bool = ..., + accelerator: Optional[str] = ..., + index_cache_size: Optional[int] = ..., + *, + index_type: VectorIndexType = ..., + wait_timeout: Optional[timedelta] = ..., + num_bits: int = ..., + max_iterations: int = ..., + sample_rate: int = ..., + m: int = ..., + ef_construction: int = ..., + name: Optional[str] = ..., + train: bool = ..., + target_partition_size: Optional[int] = ..., + ) -> None: ... + + def create_index( + self, + metric: DistanceType = "l2", + num_partitions: Optional[int] = None, + num_sub_vectors: Optional[int] = None, vector_column_name: str = VECTOR_COLUMN_NAME, replace: bool = True, accelerator: Optional[str] = None, @@ -824,46 +880,53 @@ class Table(ABC): sample_rate: int = 256, m: int = 20, ef_construction: int = 300, + config: Optional[IndexConfigType] = None, name: Optional[str] = None, train: bool = True, target_partition_size: Optional[int] = None, ): - """Create an index on the table. + """Create an index on a column. + + This method supports both the new unified API and the legacy API + for backwards compatibility. The new API takes the column name as the + first positional argument and an index configuration object via + ``config``; the legacy API takes the distance metric as the first + argument plus separate ``vector_column_name`` / ``num_partitions`` / + etc. parameters, and emits a ``DeprecationWarning``. Parameters ---------- - metric: str, default "l2" - The distance metric to use when creating the index. - Valid values are "l2", "cosine", "dot", or "hamming". - l2 is euclidean distance. - Hamming is available only for binary vectors. - num_partitions: int, default 256 - The number of IVF partitions to use when creating the index. - Default is 256. - num_sub_vectors: int, default 96 - The number of PQ sub-vectors to use when creating the index. - Default is 96. - vector_column_name: str, default "vector" - The vector column name to create the index. - replace: bool, default True - - If True, replace the existing index if it exists. + metric : str + For new API: the column name to index. + For legacy API: the distance metric ("l2", "cosine", "dot", "hamming"). + config : IndexConfigType, optional + The index configuration object. If provided, uses the new unified API. + Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq, + BTree, Bitmap, LabelList, FTS. + replace : bool, default True + Whether to replace an existing index on this column. + wait_timeout : timedelta, optional + Timeout to wait for async indexing to complete. + name : str, optional + Custom name for the index. + train : bool, default True + Whether to train the index with existing data. - - If False, raise an error if duplicate index exists. - accelerator: str, default None - If set, use the given accelerator to create the index. - Only support "cuda" for now. - index_cache_size : int, optional - The size of the index cache in number of entries. Default value is 256. - num_bits: int - The number of bits to encode sub-vectors. Only used with the IVF_PQ index. - Only 4 and 8 are supported. - wait_timeout: timedelta, optional - The timeout to wait if indexing is asynchronous. - name: str, optional - The name of the index. If not provided, a default name will be generated. - train: bool, default True - Whether to train the index with existing data. Vector indices always train - with existing data. + Examples + -------- + New API (recommended): + + >>> table.create_index( # doctest: +SKIP + ... "vector", config=IvfPq(distance_type="l2") + ... ) + >>> table.create_index("category", config=BTree()) # doctest: +SKIP + >>> table.create_index("content", config=FTS()) # doctest: +SKIP + + Legacy API (deprecated): + + >>> table.create_index( # doctest: +SKIP + ... "l2", vector_column_name="vector" + ... ) """ raise NotImplementedError @@ -2250,11 +2313,51 @@ class LanceTable(Table): dataset, allow_pyarrow_filter=False, batch_size=batch_size ) + # New unified API overload + @overload def create_index( self, - metric: DistanceType = "l2", - num_partitions=None, - num_sub_vectors=None, + column: str, + /, + *, + config: IndexConfigType, + replace: bool = ..., + wait_timeout: Optional[timedelta] = ..., + name: Optional[str] = ..., + train: bool = ..., + ) -> None: ... + + # Legacy API overload (deprecated) + @overload + def create_index( + self, + metric: Literal["l2", "cosine", "dot", "hamming"] = ..., + num_partitions: Optional[int] = ..., + num_sub_vectors: Optional[int] = ..., + vector_column_name: str = ..., + replace: bool = ..., + accelerator: Optional[str] = ..., + index_cache_size: Optional[int] = ..., + num_bits: int = ..., + index_type: Literal[ + "IVF_FLAT", "IVF_SQ", "IVF_PQ", "IVF_RQ", "IVF_HNSW_SQ", "IVF_HNSW_PQ" + ] = ..., + max_iterations: int = ..., + sample_rate: int = ..., + m: int = ..., + ef_construction: int = ..., + *, + wait_timeout: Optional[timedelta] = ..., + name: Optional[str] = ..., + train: bool = ..., + target_partition_size: Optional[int] = ..., + ) -> None: ... + + def create_index( + self, + metric: str = "l2", + num_partitions: Optional[int] = None, + num_sub_vectors: Optional[int] = None, vector_column_name: str = VECTOR_COLUMN_NAME, replace: bool = True, accelerator: Optional[str] = None, @@ -2274,47 +2377,232 @@ class LanceTable(Table): m: int = 20, ef_construction: int = 300, *, + config: Optional[IndexConfigType] = None, + wait_timeout: Optional[timedelta] = None, name: Optional[str] = None, train: bool = True, target_partition_size: Optional[int] = None, ): - """Create an index on the table.""" - if accelerator is not None: - # accelerator is only supported through pylance. - self.to_lance().create_index( - column=vector_column_name, - index_type=index_type, + """Create an index on a column. + + This method supports both the new unified API and the legacy API + for backwards compatibility. The new API takes the column name as the + first positional argument and an index configuration object via + ``config``; the legacy API takes the distance metric as the first + argument plus separate ``vector_column_name`` / ``num_partitions`` / + etc. parameters, and emits a ``DeprecationWarning``. + + Parameters + ---------- + metric : str + For new API: the column name to index. + For legacy API: the distance metric ("l2", "cosine", "dot", "hamming"). + config : IndexConfigType, optional + The index configuration object. If provided, uses the new unified API. + Can be one of: IvfFlat, IvfPq, IvfSq, IvfRq, HnswPq, HnswSq, + BTree, Bitmap, LabelList, FTS. + replace : bool, default True + Whether to replace an existing index on this column. + wait_timeout : timedelta, optional + Timeout to wait for async indexing to complete. + name : str, optional + Custom name for the index. + train : bool, default True + Whether to train the index with existing data. + + Examples + -------- + New API (recommended): + + >>> table.create_index( # doctest: +SKIP + ... "vector", config=IvfPq(distance_type="l2") + ... ) + >>> table.create_index("category", config=BTree()) # doctest: +SKIP + >>> table.create_index("content", config=FTS()) # doctest: +SKIP + + Legacy API (deprecated): + + >>> table.create_index( # doctest: +SKIP + ... "l2", vector_column_name="vector" + ... ) + """ + # Detect whether this is a legacy API call + is_legacy = self._is_legacy_create_index_call( + metric, + config, + num_partitions, + num_sub_vectors, + vector_column_name, + accelerator, + index_cache_size, + ) + + if is_legacy: + warnings.warn( + "The create_index() API with metric/num_partitions parameters is " + "deprecated and will be removed in a future version. " + "Please migrate to the new unified API:\n" + " # Old (deprecated):\n" + " table.create_index('l2', vector_column_name='my_vector')\n" + " # New (recommended):\n" + " table.create_index('my_vector', config=IvfPq(distance_type='l2'))", + DeprecationWarning, + stacklevel=2, + ) + + # Legacy API: first arg is the distance metric + column = vector_column_name + + # Build config from legacy parameters + config = self._build_vector_config_from_legacy_params( metric=metric, + index_type=index_type, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, - replace=replace, - accelerator=accelerator, - index_cache_size=index_cache_size, num_bits=num_bits, + max_iterations=max_iterations, + sample_rate=sample_rate, m=m, ef_construction=ef_construction, target_partition_size=target_partition_size, + accelerator=accelerator, ) - self.checkout_latest() - return - elif index_type == "IVF_FLAT": - config = IvfFlat( + + # Handle accelerator through pylance + if accelerator is not None: + self.to_lance().create_index( + column=column, + index_type=index_type, + metric=metric, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + replace=replace, + accelerator=accelerator, + index_cache_size=index_cache_size, + num_bits=num_bits, + m=m, + ef_construction=ef_construction, + target_partition_size=target_partition_size, + ) + self.checkout_latest() + return + else: + # New API: metric is the column name + column = metric + + # Check if config has accelerator set and dispatch to pylance + if config is not None and hasattr(config, "accelerator"): + acc = getattr(config, "accelerator", None) + if acc is not None: + # Dispatch to pylance for GPU acceleration + index_type_map = { + "IvfFlat": "IVF_FLAT", + "IvfSq": "IVF_SQ", + "IvfPq": "IVF_PQ", + "IvfRq": "IVF_RQ", + "HnswPq": "IVF_HNSW_PQ", + "HnswSq": "IVF_HNSW_SQ", + } + cfg_type = type(config).__name__ + lance_index_type = index_type_map.get(cfg_type, "IVF_PQ") + + self.to_lance().create_index( + column=column, + index_type=lance_index_type, + metric=getattr(config, "distance_type", "l2"), + num_partitions=getattr(config, "num_partitions", None), + num_sub_vectors=getattr(config, "num_sub_vectors", None), + replace=replace, + accelerator=acc, + num_bits=getattr(config, "num_bits", 8), + m=getattr(config, "m", 20), + ef_construction=getattr(config, "ef_construction", 300), + target_partition_size=getattr( + config, "target_partition_size", None + ), + ) + self.checkout_latest() + return + + return LOOP.run( + self._table.create_index( + column, + replace=replace, + config=config, + wait_timeout=wait_timeout, + name=name, + train=train, + ) + ) + + def _is_legacy_create_index_call( + self, + first_arg: str, + config: Optional[IndexConfigType], + num_partitions: Optional[int], + num_sub_vectors: Optional[int], + vector_column_name: str, + accelerator: Optional[str], + index_cache_size: Optional[int], + ) -> bool: + """Detect if this is a legacy create_index call.""" + # If config is provided, it's definitely the new API + if config is not None: + return False + + # If old-style parameters were explicitly set, it's legacy + if any( + x is not None + for x in (num_partitions, num_sub_vectors, accelerator, index_cache_size) + ): + return True + + # If vector_column_name differs from default, it's legacy + if vector_column_name != VECTOR_COLUMN_NAME: + return True + + # If first arg is a known metric, assume legacy + if first_arg.lower() in KNOWN_METRICS: + return True + + # Otherwise assume new API + return False + + def _build_vector_config_from_legacy_params( + self, + metric: str, + index_type: str, + num_partitions: Optional[int], + num_sub_vectors: Optional[int], + num_bits: int, + max_iterations: int, + sample_rate: int, + m: int, + ef_construction: int, + target_partition_size: Optional[int], + accelerator: Optional[str], + ) -> IndexConfigType: + """Build an index config object from legacy parameters.""" + if index_type == "IVF_FLAT": + return IvfFlat( distance_type=metric, num_partitions=num_partitions, max_iterations=max_iterations, sample_rate=sample_rate, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_SQ": - config = IvfSq( + return IvfSq( distance_type=metric, num_partitions=num_partitions, max_iterations=max_iterations, sample_rate=sample_rate, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_PQ": - config = IvfPq( + return IvfPq( distance_type=metric, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, @@ -2322,18 +2610,20 @@ class LanceTable(Table): max_iterations=max_iterations, sample_rate=sample_rate, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_RQ": - config = IvfRq( + return IvfRq( distance_type=metric, num_partitions=num_partitions, num_bits=num_bits, max_iterations=max_iterations, sample_rate=sample_rate, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_HNSW_PQ": - config = HnswPq( + return HnswPq( distance_type=metric, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, @@ -2343,9 +2633,10 @@ class LanceTable(Table): m=m, ef_construction=ef_construction, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_HNSW_SQ": - config = HnswSq( + return HnswSq( distance_type=metric, num_partitions=num_partitions, max_iterations=max_iterations, @@ -2353,9 +2644,10 @@ class LanceTable(Table): m=m, ef_construction=ef_construction, target_partition_size=target_partition_size, + accelerator=accelerator, ) elif index_type == "IVF_HNSW_FLAT": - config = HnswFlat( + return HnswFlat( distance_type=metric, num_partitions=num_partitions, max_iterations=max_iterations, @@ -2367,16 +2659,6 @@ class LanceTable(Table): else: raise ValueError(f"Unknown index type {index_type}") - return LOOP.run( - self._table.create_index( - vector_column_name, - replace=replace, - config=config, - name=name, - train=train, - ) - ) - def drop_index(self, name: str) -> None: """ Drops an index from the table @@ -2476,6 +2758,11 @@ class LanceTable(Table): """ return LOOP.run(self._table.latest_storage_options()) + @deprecation.deprecated( + deprecated_in="0.25.0", + current_version=__version__, + details="Use create_index() with config=BTree()/Bitmap()/LabelList() instead.", + ) def create_scalar_index( self, column: str, @@ -2484,6 +2771,12 @@ class LanceTable(Table): index_type: ScalarIndexType = "BTREE", name: Optional[str] = None, ): + """Create a scalar index on a column. + + .. deprecated:: 0.25.0 + Use :meth:`create_index` with a BTree, Bitmap, or LabelList config instead. + Example: ``table.create_index("column", config=BTree())`` + """ if index_type == "BTREE": config = BTree() elif index_type == "BITMAP": @@ -2496,6 +2789,11 @@ class LanceTable(Table): self._table.create_index(column, replace=replace, config=config, name=name) ) + @deprecation.deprecated( + deprecated_in="0.25.0", + current_version=__version__, + details="Use create_index() with config=FTS() instead.", + ) def create_fts_index( self, field_names: Union[str, List[str]], @@ -2519,6 +2817,12 @@ class LanceTable(Table): prefix_only: bool = False, name: Optional[str] = None, ): + """Create a full-text search index on a column. + + .. deprecated:: 0.25.0 + Use :meth:`create_index` with an FTS config instead. + Example: ``table.create_index("text_column", config=FTS())`` + """ self._ensure_no_legacy_fts_index() if use_tantivy: diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index 62f8f93d3..db83cb678 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -215,11 +215,12 @@ def test_reject_legacy_tantivy_index(table): @pytest.mark.parametrize("with_position", [True, False]) def test_create_inverted_index(table, with_position): - table.create_fts_index( - "text", - with_position=with_position, - name="custom_fts_index", - ) + with pytest.warns(DeprecationWarning, match="create_fts_index"): + table.create_fts_index( + "text", + with_position=with_position, + name="custom_fts_index", + ) indices = table.list_indices() fts_indices = [i for i in indices if i.index_type == "FTS"] assert any(i.name == "custom_fts_index" for i in fts_indices) diff --git a/python/python/tests/test_remote_db.py b/python/python/tests/test_remote_db.py index c50cf29f9..ab49d76d4 100644 --- a/python/python/tests/test_remote_db.py +++ b/python/python/tests/test_remote_db.py @@ -436,22 +436,25 @@ def test_table_create_indices(): # This is a smoke-test. table = db.create_table("test", [{"id": 1}]) - # Test create_scalar_index with custom name - table.create_scalar_index( - "id", wait_timeout=timedelta(seconds=2), name="custom_scalar_idx" - ) + # Test create_scalar_index with custom name (legacy method) + with pytest.warns(DeprecationWarning, match="create_scalar_index"): + table.create_scalar_index( + "id", wait_timeout=timedelta(seconds=2), name="custom_scalar_idx" + ) - # Test create_fts_index with custom name - table.create_fts_index( - "text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx" - ) + # Test create_fts_index with custom name (legacy method) + with pytest.warns(DeprecationWarning, match="create_fts_index"): + table.create_fts_index( + "text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx" + ) - # Test create_index with custom name - table.create_index( - vector_column_name="vector", - wait_timeout=timedelta(seconds=10), - name="custom_vector_idx", - ) + # Test create_index with custom name (legacy form: vector_column_name kwarg) + with pytest.warns(DeprecationWarning, match="create_index"): + table.create_index( + vector_column_name="vector", + wait_timeout=timedelta(seconds=10), + name="custom_vector_idx", + ) # Validate that the name parameter was passed correctly in requests assert len(received_requests) == 3 @@ -480,6 +483,68 @@ def test_table_create_indices(): table.drop_index("custom_fts_idx") +def test_remote_create_index_new_api(): + received_requests = [] + + def handler(request): + if request.path == "/v1/table/test/create_index/": + content_len = int(request.headers.get("Content-Length", 0)) + body = request.rfile.read(content_len) if content_len > 0 else b"" + received_requests.append(json.loads(body) if body else {}) + request.send_response(200) + request.end_headers() + elif request.path == "/v1/table/test/create/?mode=create": + request.send_response(200) + request.send_header("Content-Type", "application/json") + request.end_headers() + request.wfile.write(b"{}") + elif request.path == "/v1/table/test/describe/": + request.send_response(200) + request.send_header("Content-Type", "application/json") + request.end_headers() + request.wfile.write( + json.dumps( + dict( + version=1, + schema=dict( + fields=[ + dict(name="id", type={"type": "int64"}, nullable=False) + ] + ), + ) + ).encode() + ) + else: + request.send_response(404) + request.end_headers() + + from lancedb.index import BTree, FTS, IvfPq, IvfRq + + with mock_lancedb_connection(handler) as db: + table = db.create_table("test", [{"id": 1}]) + + # New API: column-first, config= kwarg. Should NOT emit DeprecationWarning. + import warnings as _warnings + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", DeprecationWarning) + table.create_index("vector", config=IvfPq(distance_type="l2")) + table.create_index("category", config=BTree()) + table.create_index("text", config=FTS()) + # IvfRq via new API + table.create_index("vector", config=IvfRq(distance_type="l2")) + + # Legacy index_type="IVF_RQ" routes to IvfRq config under the hood. + with pytest.warns(DeprecationWarning, match="create_index"): + table.create_index( + vector_column_name="vector", + index_type="IVF_RQ", + num_partitions=8, + ) + + assert len(received_requests) == 5 + + def test_table_wait_for_index_timeout(): def handler(request): index_stats = dict( diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index ed4656d81..2a07c2df6 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -4,6 +4,7 @@ import os import sys +import warnings from datetime import date, datetime, timedelta from time import sleep from typing import List @@ -11,7 +12,7 @@ from unittest.mock import patch import lancedb from lancedb.dependencies import _PANDAS_AVAILABLE -from lancedb.index import HnswFlat, HnswPq, HnswSq, IvfPq +from lancedb.index import BTree, FTS, HnswFlat, HnswPq, HnswSq, IvfPq import numpy as np import polars as pl import pyarrow as pa @@ -928,7 +929,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): num_bits=4, ) mock_create_index.assert_called_with( - "vector", replace=True, config=expected_config, name=None, train=True + "vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) # Test with target_partition_size @@ -948,7 +954,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): target_partition_size=8192, ) mock_create_index.assert_called_with( - "vector", replace=True, config=expected_config, name=None, train=True + "vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) # target_partition_size has a default value, @@ -967,7 +978,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): num_bits=4, ) mock_create_index.assert_called_with( - "vector", replace=True, config=expected_config, name=None, train=True + "vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) table.create_index( @@ -978,7 +994,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): ) expected_config = HnswPq(distance_type="dot") mock_create_index.assert_called_with( - "my_vector", replace=False, config=expected_config, name=None, train=True + "my_vector", + replace=False, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) table.create_index( @@ -993,7 +1014,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10 ) mock_create_index.assert_called_with( - "my_vector", replace=True, config=expected_config, name=None, train=True + "my_vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) table.create_index( @@ -1008,7 +1034,12 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection): distance_type="cosine", sample_rate=0.1, m=29, ef_construction=10 ) mock_create_index.assert_called_with( - "my_vector", replace=True, config=expected_config, name=None, train=True + "my_vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=True, ) @@ -1032,6 +1063,7 @@ def test_create_index_name_and_train_parameters( "vector", replace=True, config=expected_config, + wait_timeout=None, name="my_custom_index", train=True, ) @@ -1039,13 +1071,82 @@ def test_create_index_name_and_train_parameters( # Test with train=False table.create_index(vector_column_name="vector", train=False) mock_create_index.assert_called_with( - "vector", replace=True, config=expected_config, name=None, train=False + "vector", + replace=True, + config=expected_config, + wait_timeout=None, + name=None, + train=False, ) # Test with both name and train table.create_index(vector_column_name="vector", name="my_index_name", train=True) mock_create_index.assert_called_with( - "vector", replace=True, config=expected_config, name="my_index_name", train=True + "vector", + replace=True, + config=expected_config, + wait_timeout=None, + name="my_index_name", + train=True, + ) + + +@patch("lancedb.table.AsyncTable.create_index") +def test_create_index_legacy_emits_deprecation_warning( + mock_create_index, mem_db: DBConnection +): + table = mem_db.create_table( + "test", + data=[{"vector": [3.1, 4.1]}, {"vector": [5.9, 26.5]}], + ) + + with pytest.warns(DeprecationWarning, match="create_index"): + table.create_index(metric="l2", num_partitions=8, vector_column_name="vector") + + +@patch("lancedb.table.AsyncTable.create_index") +def test_create_index_new_api(mock_create_index, mem_db: DBConnection): + table = mem_db.create_table( + "test", + data=[ + {"vector": [3.1, 4.1], "category": "a", "text": "hello world"}, + {"vector": [5.9, 26.5], "category": "b", "text": "goodbye"}, + ], + ) + + # Vector index via new API should not warn + with warnings.catch_warnings(): + warnings.simplefilter("error", DeprecationWarning) + table.create_index("vector", config=IvfPq(distance_type="l2")) + mock_create_index.assert_called_with( + "vector", + replace=True, + config=IvfPq(distance_type="l2"), + wait_timeout=None, + name=None, + train=True, + ) + + # Scalar index via new API + table.create_index("category", config=BTree()) + mock_create_index.assert_called_with( + "category", + replace=True, + config=BTree(), + wait_timeout=None, + name=None, + train=True, + ) + + # FTS index via new API + table.create_index("text", config=FTS(with_position=True)) + mock_create_index.assert_called_with( + "text", + replace=True, + config=FTS(with_position=True), + wait_timeout=None, + name=None, + train=True, ) @@ -1861,8 +1962,9 @@ def test_create_scalar_index(mem_db: DBConnection): "my_table", data=test_data, ) - # Test with default name - table.create_scalar_index("x") + # Test with default name; confirm DeprecationWarning fires + with pytest.warns(DeprecationWarning, match="create_scalar_index"): + table.create_scalar_index("x") indices = table.list_indices() assert len(indices) == 1 scalar_index = indices[0]