# lancedb/python/lancedb/remote/table.py

#  Copyright 2023 LanceDB Developers
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import uuid
from functools import cached_property
from typing import Optional, Union

import pyarrow as pa
from lance import json_to_schema

from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME

from ..query import LanceVectorQueryBuilder
from ..table import Query, Table, _sanitize_data
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE
from .db import RemoteDBConnection


class RemoteTable(Table):
    def __init__(self, conn: RemoteDBConnection, name: str):
        self._conn = conn
        self._name = name

    def __repr__(self) -> str:
        return f"RemoteTable({self._conn.db_name}.{self._name})"

    @cached_property
    def schema(self) -> pa.Schema:
        """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
        of this Table.

        The schema is decoded from the ``"schema"`` entry of the table's
        ``describe`` response. Because this is a ``cached_property``, the
        request is only issued on first access; later accesses on the same
        object reuse the stored value.
        """
        loop = self._conn._loop
        describe_request = self._conn._client.post(
            f"/v1/table/{self._name}/describe/"
        )
        description = loop.run_until_complete(describe_request)
        return json_to_schema(description["schema"])

    @property
    def version(self) -> int:
        """Get the current version of the table"""
        resp = self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/describe/")
        )
        return resp["version"]

    def to_arrow(self) -> pa.Table:
        """to_arrow() is not supported on the LanceDB cloud

        """
        raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")

    def to_pandas(self):
        """to_pandas() is not supported on the LanceDB cloud

        """
        return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")

    def create_index(
        self,
        metric="L2",
        vector_column_name: str = VECTOR_COLUMN_NAME,
        index_cache_size: Optional[int] = None,
    ):
        """Create an index on the table.
        Currently, the only parameters that matter are
        the metric and the vector column name.

        Parameters
        ----------
        metric : str
            The metric to use for the index. Default is "L2".
        vector_column_name : str
            The name of the vector column. Default is "vector".

        Examples
        --------
        >>> import lancedb
        >>> import uuid
        >>> from lancedb.schema import vector
        >>> conn = lancedb.connect("db://...", api_key="...", region="...")
        >>> table_name = uuid.uuid4().hex
        >>> schema = pa.schema(
        ...     [
        ...             pa.field("id", pa.uint32(), False),
        ...            pa.field("vector", vector(128), False),
        ...             pa.field("s", pa.string(), False),
        ...     ]
        ... )
        >>> table = conn.create_table(
        >>>     table_name,
        >>>     schema=schema,
        >>> )
        >>> table.create_index("L2", "vector")
        """
        index_type = "vector"

        data = {
            "column": vector_column_name,
            "index_type": index_type,
            "metric_type": metric,
            "index_cache_size": index_cache_size,
        }
        resp = self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/create_index/", data=data)
        )
        return resp

    def add(
        self,
        data: DATA,
        mode: str = "append",
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
    ) -> int:
        """Add more data to the [Table](Table).

        It has the same API signature as the OSS version. The data is
        sanitized against this table's schema, serialized with
        ``to_ipc_binary`` and posted to the table's ``insert`` endpoint
        together with a freshly generated request id.

        NOTE(review): the signature is annotated ``-> int`` but this
        implementation does not return a value (callers receive ``None``);
        confirm which one is intended.

        Parameters
        ----------
        data: DATA
            The data to insert into the table. Acceptable types are:

            - dict or list-of-dict

            - pandas.DataFrame

            - pyarrow.Table or pyarrow.RecordBatch
        mode: str
            The mode to use when writing the data. Valid values are
            "append" and "overwrite". The value is forwarded as a request
            parameter without local validation.
        on_bad_vectors: str, default "error"
            What to do if any of the vectors are not the same size or contains NaNs.
            One of "error", "drop", "fill".
        fill_value: float, default 0.
            The value to use when filling vectors. Only used if on_bad_vectors="fill".

        """
        sanitized = _sanitize_data(
            data,
            self.schema,
            metadata=None,
            on_bad_vectors=on_bad_vectors,
            fill_value=fill_value,
        )
        body = to_ipc_binary(sanitized)

        # Each insert carries its own random request id.
        query_params = {"request_id": uuid.uuid4().hex, "mode": mode}

        loop = self._conn._loop
        insert_request = self._conn._client.post(
            f"/v1/table/{self._name}/insert/",
            data=body,
            params=query_params,
            content_type=ARROW_STREAM_CONTENT_TYPE,
        )
        loop.run_until_complete(insert_request)

    def search(
        self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
    ) -> LanceVectorQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector. We currently support [vector search][search]

        All query options are defined in [Query][lancedb.query.Query].

        Examples
        --------
        >>> import lancedb
        >>> db = lancedb.connect("db://...", api_key="...", region="...")
        >>> data = [
        ...    {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
        ...    {"original_width": 2000, "caption": "foo",  "vector": [0.5, 3.4, 1.3]},
        ...    {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
        ... ]
        >>> table = db.create_table("my_table", data)
        >>> query = [0.4, 1.4, 2.4]
        >>> (table.search(query, vector_column_name="vector")
        ...     .where("original_width > 1000", prefilter=True)
        ...     .select(["caption", "original_width"])
        ...     .limit(2)
        ...     .to_pandas())
          caption  original_width           vector  _distance
        0     foo            2000  [0.5, 3.4, 1.3]   5.220000
        1    test            3000  [0.3, 6.2, 2.6]  23.089996

        Parameters
        ----------
        query: VEC or str
            The targeted vector to search for. This argument is required and
            is handed to the query builder unchanged.
        vector_column_name: str
            The name of the vector column to search.
            Defaults to ``VECTOR_COLUMN_NAME``.

        Returns
        -------
        LanceVectorQueryBuilder
            A query builder object representing the query.
            Once executed, the query returns

            - selected columns

            - the vector

            - and also the "_distance" column which is the distance between the query
            vector and the returned vector.
        """
        # Nothing is sent here; this only constructs the builder, passing
        # this table along with the query and column name.
        builder = LanceVectorQueryBuilder(self, query, vector_column_name)
        return builder

    def _execute_query(self, query: Query) -> pa.Table:
        result = self._conn._client.query(self._name, query)
        return self._conn._loop.run_until_complete(result).to_arrow()

    def delete(self, predicate: str):
        """Delete rows from the table.

        This can be used to delete a single row, many rows, all rows, or
        sometimes no rows (if your predicate matches nothing).

        Parameters
        ----------
        predicate: str
            The SQL where clause to use when deleting rows.

            - For example, 'x = 2' or 'x IN (1, 2, 3)'.

            The filter must not be empty, or it will error.

        Examples
        --------
        >>> import lancedb
        >>> data = [
        ...    {"x": 1, "vector": [1, 2]},
        ...    {"x": 2, "vector": [3, 4]},
        ...    {"x": 3, "vector": [5, 6]}
        ... ]
        >>> db = lancedb.connect("db://...", api_key="...", region="...")
        >>> table = db.create_table("my_table", data)
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  3  [5.0, 6.0]       41.0
        1  2  [3.0, 4.0]       85.0
        2  1  [1.0, 2.0]      145.0
        >>> table.delete("x = 2")
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  3  [5.0, 6.0]       41.0
        1  1  [1.0, 2.0]      145.0

        If you have a list of values to delete, you can combine them into a
        stringified list and use the `IN` operator:

        >>> to_remove = [1, 3]
        >>> to_remove = ", ".join([str(v) for v in to_remove])
        >>> to_remove
        '1, 3'
        >>> table.delete(f"x IN ({to_remove})")
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  2  [3.0, 4.0]       85.0
        """
        payload = {"predicate": predicate}
        self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
        )