reformatted

SaaS python SDK doc
2025-12-23 21:39:57 +00:00 · 2023-12-07 12:08:05 -08:00 · 2023-12-07 12:01:03 -08:00
5 changed files with 262 additions and 43 deletions
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -146,7 +146,8 @@ nav:
  - Serverless Chatbot from any website: examples/serverless_website_chatbot.md
  - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
 - API references:
-  - Python API: python/python.md
+  - OSS Python API: python/python.md
  - SaaS Python API: python/saas-python.md
  - Javascript API: javascript/modules.md
 - LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms
--- a/docs/src/python/saas-python.md
+++ b/docs/src/python/saas-python.md
@@ -0,0 +1,18 @@
 # LanceDB SaaS Python API Reference
 ## Installation
 ```shell
 pip install lancedb
 ```
 ## Connection
 ::: lancedb.connect
 ::: lancedb.remote.db.RemoteDBConnection
 ## Table
 ::: lancedb.remote.table.RemoteTable
--- a/python/lancedb/init.py
+++ b/python/lancedb/init.py
@@ -27,7 +27,7 @@ def connect(
    uri: URI,
    *,
    api_key: Optional[str] = None,
-    region: str = "us-west-2",
+    region: str = "us-east-1",
    host_override: Optional[str] = None,
 ) -> DBConnection:
    """Connect to a LanceDB database.
@@ -39,7 +39,7 @@ def connect(
    api_key: str, optional
        If presented, connect to LanceDB cloud.
        Otherwise, connect to a database on file system or cloud storage.
-    region: str, default "us-west-2"
+    region: str, default "us-east-1"
        The region to use for LanceDB Cloud.
    host_override: str, optional
        The override url for LanceDB Cloud.
--- a/python/lancedb/remote/db.py
+++ b/python/lancedb/remote/db.py
@@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection):
        return f"RemoveConnect(name={self.db_name})"
    @override
-    def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]:
+    def table_names(
        self, page_token: Optional[str] = None, limit: int = 10
    ) -> Iterable[str]:
        """List the names of all tables in the database.
        Parameters
        ----------
        page_token: str
            The last token to start the new page.
        limit: int, default 10
            The maximum number of tables to return for each page.
        Returns
        -------
@@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection):
        fill_value: float = 0.0,
        embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
    ) -> Table:
        """Create a [Table][lancedb.table.Table] in the database.
        Parameters
        ----------
        name: str
            The name of the table.
        data: The data to initialize the table, *optional*
            User must provide at least one of `data` or `schema`.
            Acceptable types are:
            - dict or list-of-dict
            - pandas.DataFrame
            - pyarrow.Table or pyarrow.RecordBatch
        schema: The schema of the table, *optional*
            Acceptable types are:
            - pyarrow.Schema
            - [LanceModel][lancedb.pydantic.LanceModel]
        on_bad_vectors: str, default "error"
            What to do if any of the vectors are not the same size or contain NaNs.
            One of "error", "drop", "fill".
        fill_value: float
            The value to use when filling vectors. Only used if on_bad_vectors="fill".
        Returns
        -------
        LanceTable
            A reference to the newly created table.
        !!! note
            The vector index won't be created by default.
            To create the index, call the `create_index` method on the table.
        Examples
        --------
        You can create a table from a list of tuples or dictionaries:
        >>> import lancedb
        >>> db = lancedb.connect("db://test-project-8f45eb")
        >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
        ...         {"vector": [0.2, 1.8], "lat": 40.1, "long":  -74.1}]
        >>> db.create_table("my_table", data)
        LanceTable(my_table)
        You can also pass a pandas DataFrame:
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        ...    "vector": [[1.1, 1.2], [0.2, 1.8]],
        ...    "lat": [45.5, 40.1],
        ...    "long": [-122.7, -74.1]
        ... })
        >>> db.create_table("table2", data)
        LanceTable(table2)
        >>> import pyarrow as pa
        >>> custom_schema = pa.schema([
        ...   pa.field("vector", pa.list_(pa.float32(), 2)),
        ...   pa.field("lat", pa.float32()),
        ...   pa.field("long", pa.float32())
        ... ])
        >>> db.create_table("table3", data, schema = custom_schema)
        LanceTable(table3)
        It is also possible to create a table from `[Iterable[pa.RecordBatch]]`:
        >>> import pyarrow as pa
        >>> def make_batches():
        ...     for i in range(5):
        ...         yield pa.RecordBatch.from_arrays(
        ...             [
        ...                 pa.array([[3.1, 4.1], [5.9, 26.5]],
        ...                     pa.list_(pa.float32(), 2)),
        ...                 pa.array(["foo", "bar"]),
        ...                 pa.array([10.0, 20.0]),
        ...             ],
        ...             ["vector", "item", "price"],
        ...         )
        >>> schema=pa.schema([
        ...     pa.field("vector", pa.list_(pa.float32(), 2)),
        ...     pa.field("item", pa.utf8()),
        ...     pa.field("price", pa.float32()),
        ... ])
        >>> db.create_table("table4", make_batches(), schema=schema)
        LanceTable(table4)
        """
        if data is None and schema is None:
            raise ValueError("Either data or schema must be provided.")
        if embedding_functions is not None:
--- a/python/lancedb/remote/table.py
+++ b/python/lancedb/remote/table.py
@@ -37,7 +37,10 @@ class RemoteTable(Table):
    @cached_property
    def schema(self) -> pa.Schema:
-        """Return the schema of the table."""
+        """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
        of this Table
        """
        resp = self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/describe/")
        )
@@ -53,24 +56,17 @@ class RemoteTable(Table):
        return resp["version"]
    def to_arrow(self) -> pa.Table:
-        """Return the table as an Arrow table."""
+        """to_arrow() is not supported on the LanceDB cloud"""
        raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
    def to_pandas(self):
-        """Return the table as a Pandas DataFrame.
+        """to_pandas() is not supported on the LanceDB cloud"""
        Intercept `to_arrow()` for better error message.
        """
        return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
    def create_index(
        self,
        metric="L2",
        num_partitions=256,
        num_sub_vectors=96,
        vector_column_name: str = VECTOR_COLUMN_NAME,
        replace: bool = True,
        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
    ):
        """Create an index on the table.
@@ -81,39 +77,28 @@ class RemoteTable(Table):
        ----------
        metric : str
            The metric to use for the index. Default is "L2".
        num_partitions : int
            The number of partitions to use for the index. Default is 256.
        num_sub_vectors : int
            The number of sub-vectors to use for the index. Default is 96.
        vector_column_name : str
            The name of the vector column. Default is "vector".
        replace : bool
            Whether to replace the existing index. Default is True.
        accelerator : str, optional
            If set, use the given accelerator to create the index.
            Default is None. Currently not supported.
        index_cache_size : int, optional
            The size of the index cache in number of entries. Default value is 256.
        Examples
        --------
-        import lancedb
+        >>> import lancedb
-        import uuid
+        >>> import uuid
-        from lancedb.schema import vector
+        >>> from lancedb.schema import vector
-        conn = lancedb.connect("db://...", api_key="...", region="...")
+        >>> conn = lancedb.connect("db://...", api_key="...", region="...")
-        table_name = uuid.uuid4().hex
+        >>> table_name = uuid.uuid4().hex
-        schema = pa.schema(
+        >>> schema = pa.schema(
-            [
+        ...     [
-                    pa.field("id", pa.uint32(), False),
+        ...             pa.field("id", pa.uint32(), False),
-                    pa.field("vector", vector(128), False),
+        ...             pa.field("vector", vector(128), False),
-                    pa.field("s", pa.string(), False),
+        ...             pa.field("s", pa.string(), False),
-            ]
+        ...     ]
-        )
+        ... )
-        table = conn.create_table(
+        >>> table = conn.create_table(
-            table_name,
+        ...     table_name,
-            schema=schema,
+        ...     schema=schema,
-        )
+        ... )
-        table.create_index()
+        >>> table.create_index(metric="L2", vector_column_name="vector")
        """
        index_type = "vector"
@@ -135,6 +120,28 @@ class RemoteTable(Table):
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
    ) -> int:
        """Add more data to the [Table](Table). It has the same API signature as the OSS version.
        Parameters
        ----------
        data: DATA
            The data to insert into the table. Acceptable types are:
            - dict or list-of-dict
            - pandas.DataFrame
            - pyarrow.Table or pyarrow.RecordBatch
        mode: str
            The mode to use when writing the data. Valid values are
            "append" and "overwrite".
        on_bad_vectors: str, default "error"
            What to do if any of the vectors are not the same size or contain NaNs.
            One of "error", "drop", "fill".
        fill_value: float, default 0.
            The value to use when filling vectors. Only used if on_bad_vectors="fill".
        """
        data = _sanitize_data(
            data,
            self.schema,
@@ -158,6 +165,58 @@ class RemoteTable(Table):
    def search(
        self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
    ) -> LanceVectorQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector. We currently support [vector search][search].
        All query options are defined in [Query][lancedb.query.Query].
        Examples
        --------
        >>> import lancedb
        >>> db = lancedb.connect("db://...", api_key="...", region="...")
        >>> data = [
        ...    {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
        ...    {"original_width": 2000, "caption": "foo",  "vector": [0.5, 3.4, 1.3]},
        ...    {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
        ... ]
        >>> table = db.create_table("my_table", data)
        >>> query = [0.4, 1.4, 2.4]
        >>> (table.search(query, vector_column_name="vector")
        ...     .where("original_width > 1000", prefilter=True)
        ...     .select(["caption", "original_width"])
        ...     .limit(2)
        ...     .to_pandas())
          caption  original_width           vector  _distance
        0     foo            2000  [0.5, 3.4, 1.3]   5.220000
        1    test            3000  [0.3, 6.2, 2.6]  23.089996
        Parameters
        ----------
        query: list/np.ndarray/str/PIL.Image.Image, default None
            The targeted vector to search for.
            - *default None*.
            Acceptable types are: list, np.ndarray, PIL.Image.Image
            - If None then the select/where/limit clauses are applied to filter
            the table
        vector_column_name: str
            The name of the vector column to search.
            *default "vector"*
        Returns
        -------
        LanceQueryBuilder
            A query builder object representing the query.
            Once executed, the query returns
            - selected columns
            - the vector
            - and also the "_distance" column which is the distance between the query
            vector and the returned vector.
        """
        return LanceVectorQueryBuilder(self, query, vector_column_name)
    def _execute_query(self, query: Query) -> pa.Table:
@@ -165,7 +224,53 @@ class RemoteTable(Table):
        return self._conn._loop.run_until_complete(result).to_arrow()
    def delete(self, predicate: str):
-        """Delete rows from the table."""
+        """Delete rows from the table.
        This can be used to delete a single row, many rows, all rows, or
        sometimes no rows (if your predicate matches nothing).
        Parameters
        ----------
        predicate: str
            The SQL where clause to use when deleting rows.
            - For example, 'x = 2' or 'x IN (1, 2, 3)'.
            The filter must not be empty, or it will error.
        Examples
        --------
        >>> import lancedb
        >>> data = [
        ...    {"x": 1, "vector": [1, 2]},
        ...    {"x": 2, "vector": [3, 4]},
        ...    {"x": 3, "vector": [5, 6]}
        ... ]
        >>> db = lancedb.connect("db://...", api_key="...", region="...")
        >>> table = db.create_table("my_table", data)
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  3  [5.0, 6.0]       41.0
        1  2  [3.0, 4.0]       85.0
        2  1  [1.0, 2.0]      145.0
        >>> table.delete("x = 2")
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  3  [5.0, 6.0]       41.0
        1  1  [1.0, 2.0]      145.0
        If you have a list of values to delete, you can combine them into a
        stringified list and use the `IN` operator:
        >>> to_remove = [1, 3]
        >>> to_remove = ", ".join([str(v) for v in to_remove])
        >>> to_remove
        '1, 3'
        >>> table.delete(f"x IN ({to_remove})")
        >>> table.search([10,10]).to_pandas()
           x      vector  _distance
        0  2  [3.0, 4.0]       85.0
        """
        payload = {"predicate": predicate}
        self._conn._loop.run_until_complete(
            self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
Author	SHA1	Message	Date
qzhu	8e25e0c7f0	reformatted	2023-12-07 12:08:05 -08:00
qzhu	5f989e86d2	SaaS python SDK doc	2023-12-07 12:01:03 -08:00