reformatted

SaaS python SDK doc
2025-12-23 05:19:58 +00:00 · 2023-12-07 12:08:05 -08:00 · 2023-12-07 12:01:03 -08:00
5 changed files with 262 additions and 43 deletions
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -146,7 +146,8 @@ nav:
  - Serverless Chatbot from any website: examples/serverless_website_chatbot.md
  - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
 - API references:
-  - Python API: python/python.md
+  - OSS Python API: python/python.md
+  - SaaS Python API: python/saas-python.md
  - Javascript API: javascript/modules.md
 - LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms

--- a/docs/src/python/saas-python.md
+++ b/docs/src/python/saas-python.md
@@ -0,0 +1,18 @@
+# LanceDB Python API Reference
+
+## Installation
+
+```shell
+pip install lancedb
+```
+
+## Connection
+
+::: lancedb.connect
+
+::: lancedb.remote.db.RemoteDBConnection
+
+## Table
+
+::: lancedb.remote.table.RemoteTable
+
--- a/python/lancedb/init.py
+++ b/python/lancedb/init.py
@@ -27,7 +27,7 @@ def connect(
    uri: URI,
    *,
    api_key: Optional[str] = None,
-    region: str = "us-west-2",
+    region: str = "us-east-1",
    host_override: Optional[str] = None,
 ) -> DBConnection:
    """Connect to a LanceDB database.
@@ -39,7 +39,7 @@ def connect(
    api_key: str, optional
        If presented, connect to LanceDB cloud.
        Otherwise, connect to a database on file system or cloud storage.
-    region: str, default "us-west-2"
+    region: str, default "us-east-1"
        The region to use for LanceDB Cloud.
    host_override: str, optional
        The override url for LanceDB Cloud.
--- a/python/lancedb/remote/db.py
+++ b/python/lancedb/remote/db.py
@@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection):
        return f"RemoveConnect(name={self.db_name})"

    @override
-    def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]:
+    def table_names(
+        self, page_token: Optional[str] = None, limit: int = 10
+    ) -> Iterable[str]:
        """List the names of all tables in the database.

        Parameters
        ----------
        page_token: str
            The last token to start the new page.
+        limit: int, default 10
+            The maximum number of tables to return for each page.

        Returns
        -------
@@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection):
        fill_value: float = 0.0,
        embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
    ) -> Table:
+        """Create a [Table][lancedb.table.Table] in the database.
+
+        Parameters
+        ----------
+        name: str
+            The name of the table.
+        data: The data to initialize the table, *optional*
+            User must provide at least one of `data` or `schema`.
+            Acceptable types are:
+
+            - dict or list-of-dict
+
+            - pandas.DataFrame
+
+            - pyarrow.Table or pyarrow.RecordBatch
+        schema: The schema of the table, *optional*
+            Acceptable types are:
+
+            - pyarrow.Schema
+
+            - [LanceModel][lancedb.pydantic.LanceModel]
+        on_bad_vectors: str, default "error"
+            What to do if any of the vectors are not the same size or contain NaNs.
+            One of "error", "drop", "fill".
+        fill_value: float
+            The value to use when filling vectors. Only used if on_bad_vectors="fill".
+
+        Returns
+        -------
+        LanceTable
+            A reference to the newly created table.
+
+        !!! note
+
+            The vector index won't be created by default.
+            To create the index, call the `create_index` method on the table.
+
+        Examples
+        --------
+
+        Can create with list of tuples or dictionaries:
+
+        >>> import lancedb
+        >>> db = lancedb.connect("db://test-project-8f45eb")
+        >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
+        ...         {"vector": [0.2, 1.8], "lat": 40.1, "long":  -74.1}]
+        >>> db.create_table("my_table", data)
+        LanceTable(my_table)
+
+        You can also pass a pandas DataFrame:
+
+        >>> import pandas as pd
+        >>> data = pd.DataFrame({
+        ...    "vector": [[1.1, 1.2], [0.2, 1.8]],
+        ...    "lat": [45.5, 40.1],
+        ...    "long": [-122.7, -74.1]
+        ... })
+        >>> db.create_table("table2", data)
+        LanceTable(table2)
+
+        >>> custom_schema = pa.schema([
+        ...   pa.field("vector", pa.list_(pa.float32(), 2)),
+        ...   pa.field("lat", pa.float32()),
+        ...   pa.field("long", pa.float32())
+        ... ])
+        >>> db.create_table("table3", data, schema = custom_schema)
+        LanceTable(table3)
+
+        It is also possible to create a table from an `Iterable[pa.RecordBatch]`:
+
+        >>> import pyarrow as pa
+        >>> def make_batches():
+        ...     for i in range(5):
+        ...         yield pa.RecordBatch.from_arrays(
+        ...             [
+        ...                 pa.array([[3.1, 4.1], [5.9, 26.5]],
+        ...                     pa.list_(pa.float32(), 2)),
+        ...                 pa.array(["foo", "bar"]),
+        ...                 pa.array([10.0, 20.0]),
+        ...             ],
+        ...             ["vector", "item", "price"],
+        ...         )
+        >>> schema=pa.schema([
+        ...     pa.field("vector", pa.list_(pa.float32(), 2)),
+        ...     pa.field("item", pa.utf8()),
+        ...     pa.field("price", pa.float32()),
+        ... ])
+        >>> db.create_table("table4", make_batches(), schema=schema)
+        LanceTable(table4)
+
+        """
        if data is None and schema is None:
            raise ValueError("Either data or schema must be provided.")
        if embedding_functions is not None:
--- a/python/lancedb/remote/table.py
+++ b/python/lancedb/remote/table.py
@@ -37,7 +37,10 @@ class RemoteTable(Table):

    @cached_property
    def schema(self) -> pa.Schema:
-        """Return the schema of the table."""
+        """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
+        of this Table
+
+        """
        resp = self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/describe/")
        )
@@ -53,24 +56,17 @@ class RemoteTable(Table):
        return resp["version"]

    def to_arrow(self) -> pa.Table:
-        """Return the table as an Arrow table."""
+        """to_arrow() is not supported on the LanceDB cloud"""
        raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")

    def to_pandas(self):
-        """Return the table as a Pandas DataFrame.
-
-        Intercept `to_arrow()` for better error message.
-        """
+        """to_pandas() is not supported on the LanceDB cloud"""
        return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")

    def create_index(
        self,
        metric="L2",
-        num_partitions=256,
-        num_sub_vectors=96,
        vector_column_name: str = VECTOR_COLUMN_NAME,
-        replace: bool = True,
-        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
    ):
        """Create an index on the table.
@@ -81,39 +77,28 @@ class RemoteTable(Table):
        ----------
        metric : str
            The metric to use for the index. Default is "L2".
-        num_partitions : int
-            The number of partitions to use for the index. Default is 256.
-        num_sub_vectors : int
-            The number of sub-vectors to use for the index. Default is 96.
        vector_column_name : str
            The name of the vector column. Default is "vector".
-        replace : bool
-            Whether to replace the existing index. Default is True.
-        accelerator : str, optional
-            If set, use the given accelerator to create the index.
-            Default is None. Currently not supported.
-        index_cache_size : int, optional
-            The size of the index cache in number of entries. Default value is 256.

        Examples
        --------
-        import lancedb
-        import uuid
-        from lancedb.schema import vector
-        conn = lancedb.connect("db://...", api_key="...", region="...")
-        table_name = uuid.uuid4().hex
-        schema = pa.schema(
-            [
-                    pa.field("id", pa.uint32(), False),
-                    pa.field("vector", vector(128), False),
-                    pa.field("s", pa.string(), False),
-            ]
-        )
-        table = conn.create_table(
-            table_name,
-            schema=schema,
-        )
-        table.create_index()
+        >>> import lancedb
+        >>> import uuid
+        >>> from lancedb.schema import vector
+        >>> conn = lancedb.connect("db://...", api_key="...", region="...")
+        >>> table_name = uuid.uuid4().hex
+        >>> schema = pa.schema(
+        ...     [
+        ...             pa.field("id", pa.uint32(), False),
+        ...             pa.field("vector", vector(128), False),
+        ...             pa.field("s", pa.string(), False),
+        ...     ]
+        ... )
+        >>> table = conn.create_table(
+        ...     table_name,
+        ...     schema=schema,
+        ... )
+        >>> table.create_index("L2", "vector")
        """
        index_type = "vector"

@@ -135,6 +120,28 @@ class RemoteTable(Table):
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
    ) -> int:
+        """Add more data to the [Table](Table). It has the same API signature as the OSS version.
+
+        Parameters
+        ----------
+        data: DATA
+            The data to insert into the table. Acceptable types are:
+
+            - dict or list-of-dict
+
+            - pandas.DataFrame
+
+            - pyarrow.Table or pyarrow.RecordBatch
+        mode: str
+            The mode to use when writing the data. Valid values are
+            "append" and "overwrite".
+        on_bad_vectors: str, default "error"
+            What to do if any of the vectors are not the same size or contain NaNs.
+            One of "error", "drop", "fill".
+        fill_value: float, default 0.
+            The value to use when filling vectors. Only used if on_bad_vectors="fill".
+
+        """
        data = _sanitize_data(
            data,
            self.schema,
@@ -158,6 +165,58 @@ class RemoteTable(Table):
    def search(
        self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
    ) -> LanceVectorQueryBuilder:
+        """Create a search query to find the nearest neighbors
+        of the given query vector. We currently support [vector search][search].
+
+        All query options are defined in [Query][lancedb.query.Query].
+
+        Examples
+        --------
+        >>> import lancedb
+        >>> db = lancedb.connect("db://...", api_key="...", region="...")
+        >>> data = [
+        ...    {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
+        ...    {"original_width": 2000, "caption": "foo",  "vector": [0.5, 3.4, 1.3]},
+        ...    {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
+        ... ]
+        >>> table = db.create_table("my_table", data)
+        >>> query = [0.4, 1.4, 2.4]
+        >>> (table.search(query, vector_column_name="vector")
+        ...     .where("original_width > 1000", prefilter=True)
+        ...     .select(["caption", "original_width"])
+        ...     .limit(2)
+        ...     .to_pandas())
+          caption  original_width           vector  _distance
+        0     foo            2000  [0.5, 3.4, 1.3]   5.220000
+        1    test            3000  [0.3, 6.2, 2.6]  23.089996
+
+        Parameters
+        ----------
+        query: list/np.ndarray/str/PIL.Image.Image, default None
+            The targeted vector to search for.
+
+            - *default None*.
+            Acceptable types are: list, np.ndarray, PIL.Image.Image
+
+            - If None then the select/where/limit clauses are applied to filter
+            the table
+        vector_column_name: str
+            The name of the vector column to search.
+            *default "vector"*
+
+        Returns
+        -------
+        LanceQueryBuilder
+            A query builder object representing the query.
+            Once executed, the query returns
+
+            - selected columns
+
+            - the vector
+
+            - and also the "_distance" column which is the distance between the query
+            vector and the returned vector.
+        """
        return LanceVectorQueryBuilder(self, query, vector_column_name)

    def _execute_query(self, query: Query) -> pa.Table:
@@ -165,7 +224,53 @@ class RemoteTable(Table):
        return self._conn._loop.run_until_complete(result).to_arrow()

    def delete(self, predicate: str):
-        """Delete rows from the table."""
+        """Delete rows from the table.
+
+        This can be used to delete a single row, many rows, all rows, or
+        sometimes no rows (if your predicate matches nothing).
+
+        Parameters
+        ----------
+        predicate: str
+            The SQL where clause to use when deleting rows.
+
+            - For example, 'x = 2' or 'x IN (1, 2, 3)'.
+
+            The filter must not be empty, or it will error.
+
+        Examples
+        --------
+        >>> import lancedb
+        >>> data = [
+        ...    {"x": 1, "vector": [1, 2]},
+        ...    {"x": 2, "vector": [3, 4]},
+        ...    {"x": 3, "vector": [5, 6]}
+        ... ]
+        >>> db = lancedb.connect("db://...", api_key="...", region="...")
+        >>> table = db.create_table("my_table", data)
+        >>> table.search([10,10]).to_pandas()
+           x      vector  _distance
+        0  3  [5.0, 6.0]       41.0
+        1  2  [3.0, 4.0]       85.0
+        2  1  [1.0, 2.0]      145.0
+        >>> table.delete("x = 2")
+        >>> table.search([10,10]).to_pandas()
+           x      vector  _distance
+        0  3  [5.0, 6.0]       41.0
+        1  1  [1.0, 2.0]      145.0
+
+        If you have a list of values to delete, you can combine them into a
+        stringified list and use the `IN` operator:
+
+        >>> to_remove = [1, 3]
+        >>> to_remove = ", ".join([str(v) for v in to_remove])
+        >>> to_remove
+        '1, 3'
+        >>> table.delete(f"x IN ({to_remove})")
+        >>> table.search([10,10]).to_pandas()
+           x      vector  _distance
+        0  2  [3.0, 4.0]       85.0
+        """
        payload = {"predicate": predicate}
        self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
Author	SHA1	Message	Date
qzhu	8e25e0c7f0	reformatted	2023-12-07 12:08:05 -08:00
qzhu	5f989e86d2	SaaS python SDK doc	2023-12-07 12:01:03 -08:00