diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 811cf894..67d228ba 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -144,7 +144,8 @@ nav: - Serverless Chatbot from any website: examples/serverless_website_chatbot.md - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md - API references: - - Python API: python/python.md + - OSS Python API: python/python.md + - SaaS Python API: python/saas-python.md - Javascript API: javascript/modules.md - LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms diff --git a/docs/src/python/saas-python.md b/docs/src/python/saas-python.md new file mode 100644 index 00000000..52247d46 --- /dev/null +++ b/docs/src/python/saas-python.md @@ -0,0 +1,18 @@ +# LanceDB Python API Reference + +## Installation + +```shell +pip install lancedb +``` + +## Connection + +::: lancedb.connect + +::: lancedb.remote.db.RemoteDBConnection + +## Table + +::: lancedb.remote.table.RemoteTable + diff --git a/python/lancedb/__init__.py b/python/lancedb/__init__.py index a1bbb332..bef5136c 100644 --- a/python/lancedb/__init__.py +++ b/python/lancedb/__init__.py @@ -26,7 +26,7 @@ def connect( uri: URI, *, api_key: Optional[str] = None, - region: str = "us-west-2", + region: str = "us-east-1", host_override: Optional[str] = None, ) -> DBConnection: """Connect to a LanceDB database. @@ -38,7 +38,7 @@ def connect( api_key: str, optional If presented, connect to LanceDB cloud. Otherwise, connect to a database on file system or cloud storage. - region: str, default "us-west-2" + region: str, default "us-east-1" The region to use for LanceDB Cloud. host_override: str, optional The override url for LanceDB Cloud. 
diff --git a/python/lancedb/remote/db.py b/python/lancedb/remote/db.py index 855ebf50..b512922b 100644 --- a/python/lancedb/remote/db.py +++ b/python/lancedb/remote/db.py @@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection): return f"RemoveConnect(name={self.db_name})" @override - def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]: + def table_names( + self, page_token: Optional[str] = None, limit: int = 10 + ) -> Iterable[str]: """List the names of all tables in the database. Parameters ---------- page_token: str The last token to start the new page. + limit: int, default 10 + The maximum number of tables to return for each page. Returns ------- @@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection): fill_value: float = 0.0, embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None, ) -> Table: + """Create a [Table][lancedb.table.Table] in the database. + + Parameters + ---------- + name: str + The name of the table. + data: The data to initialize the table, *optional* + User must provide at least one of `data` or `schema`. + Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch + schema: The schema of the table, *optional* + Acceptable types are: + + - pyarrow.Schema + + - [LanceModel][lancedb.pydantic.LanceModel] + on_bad_vectors: str, default "error" + What to do if any of the vectors are not the same size or contain NaNs. + One of "error", "drop", "fill". + fill_value: float + The value to use when filling vectors. Only used if on_bad_vectors="fill". + + Returns + ------- + LanceTable + A reference to the newly created table. + + !!! note + + The vector index won't be created by default. + To create the index, call the `create_index` method on the table. 
+ + Examples + -------- + + Can create with list of tuples or dictionaries: + + >>> import lancedb + >>> db = lancedb.connect("db://...", api_key="...", region="...") # doctest: +SKIP + >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7}, + ... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}] + >>> db.create_table("my_table", data) # doctest: +SKIP + LanceTable(my_table) + + You can also pass a pandas DataFrame: + + >>> import pandas as pd + >>> data = pd.DataFrame({ + ... "vector": [[1.1, 1.2], [0.2, 1.8]], + ... "lat": [45.5, 40.1], + ... "long": [-122.7, -74.1] + ... }) + >>> db.create_table("table2", data) # doctest: +SKIP + LanceTable(table2) + + >>> custom_schema = pa.schema([ + ... pa.field("vector", pa.list_(pa.float32(), 2)), + ... pa.field("lat", pa.float32()), + ... pa.field("long", pa.float32()) + ... ]) + >>> db.create_table("table3", data, schema = custom_schema) # doctest: +SKIP + LanceTable(table3) + + It is also possible to create a table from `[Iterable[pa.RecordBatch]]`: + + >>> import pyarrow as pa + >>> def make_batches(): + ... for i in range(5): + ... yield pa.RecordBatch.from_arrays( + ... [ + ... pa.array([[3.1, 4.1], [5.9, 26.5]], + ... pa.list_(pa.float32(), 2)), + ... pa.array(["foo", "bar"]), + ... pa.array([10.0, 20.0]), + ... ], + ... ["vector", "item", "price"], + ... ) + >>> schema=pa.schema([ + ... pa.field("vector", pa.list_(pa.float32(), 2)), + ... pa.field("item", pa.utf8()), + ... pa.field("price", pa.float32()), + ... 
]) + >>> db.create_table("table4", make_batches(), schema=schema) # doctest: +SKIP + LanceTable(table4) + + """ if data is None and schema is None: raise ValueError("Either data or schema must be provided.") if embedding_functions is not None: diff --git a/python/lancedb/remote/table.py b/python/lancedb/remote/table.py index cb1a3229..372aa347 100644 --- a/python/lancedb/remote/table.py +++ b/python/lancedb/remote/table.py @@ -37,7 +37,10 @@ class RemoteTable(Table): @cached_property def schema(self) -> pa.Schema: - """Return the schema of the table.""" + """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#) + of this Table + + """ resp = self._conn._loop.run_until_complete( self._conn._client.post(f"/v1/table/{self._name}/describe/") ) @@ -53,24 +56,17 @@ class RemoteTable(Table): return resp["version"] def to_arrow(self) -> pa.Table: - """Return the table as an Arrow table.""" + """to_arrow() is not supported on the LanceDB cloud""" raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud") def to_pandas(self): - """Return the table as a Pandas DataFrame. - - Intercept `to_arrow()` for better error message. - """ + """to_pandas() is not supported on the LanceDB cloud""" return NotImplementedError("to_pandas() is not supported on the LanceDB cloud") def create_index( self, metric="L2", - num_partitions=256, - num_sub_vectors=96, vector_column_name: str = VECTOR_COLUMN_NAME, - replace: bool = True, - accelerator: Optional[str] = None, index_cache_size: Optional[int] = None, ): """Create an index on the table. @@ -81,39 +77,28 @@ class RemoteTable(Table): ---------- metric : str The metric to use for the index. Default is "L2". - num_partitions : int - The number of partitions to use for the index. Default is 256. - num_sub_vectors : int - The number of sub-vectors to use for the index. Default is 96. vector_column_name : str The name of the vector column. Default is "vector". 
- replace : bool - Whether to replace the existing index. Default is True. - accelerator : str, optional - If set, use the given accelerator to create the index. - Default is None. Currently not supported. - index_cache_size : int, optional - The size of the index cache in number of entries. Default value is 256. Examples -------- - import lancedb - import uuid - from lancedb.schema import vector - conn = lancedb.connect("db://...", api_key="...", region="...") - table_name = uuid.uuid4().hex - schema = pa.schema( - [ - pa.field("id", pa.uint32(), False), - pa.field("vector", vector(128), False), - pa.field("s", pa.string(), False), - ] - ) - table = conn.create_table( - table_name, - schema=schema, - ) - table.create_index() + >>> import lancedb + >>> import uuid + >>> from lancedb.schema import vector + >>> db = lancedb.connect("db://...", api_key="...", region="...") # doctest: +SKIP + >>> table_name = uuid.uuid4().hex + >>> schema = pa.schema( + ... [ + ... pa.field("id", pa.uint32(), False), + ... pa.field("vector", vector(128), False), + ... pa.field("s", pa.string(), False), + ... ] + ... ) + >>> table = db.create_table( # doctest: +SKIP + ... table_name, # doctest: +SKIP + ... schema=schema, # doctest: +SKIP + ... ) + >>> table.create_index("L2", "vector") # doctest: +SKIP """ index_type = "vector" @@ -135,6 +120,28 @@ class RemoteTable(Table): on_bad_vectors: str = "error", fill_value: float = 0.0, ) -> int: + """Add more data to the [Table](Table). It has the same API signature as the OSS version. + + Parameters + ---------- + data: DATA + The data to insert into the table. Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch + mode: str + The mode to use when writing the data. Valid values are + "append" and "overwrite". + on_bad_vectors: str, default "error" + What to do if any of the vectors are not the same size or contain NaNs. + One of "error", "drop", "fill". 
+ fill_value: float, default 0. + The value to use when filling vectors. Only used if on_bad_vectors="fill". + + """ data = _sanitize_data( data, self.schema, @@ -158,6 +165,58 @@ class RemoteTable(Table): def search( self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME ) -> LanceVectorQueryBuilder: + """Create a search query to find the nearest neighbors + of the given query vector. We currently support [vector search][search] + + All query options are defined in [Query][lancedb.query.Query]. + + Examples + -------- + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> data = [ + ... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]}, + ... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]}, + ... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]} + ... ] + >>> table = db.create_table("my_table", data) + >>> query = [0.4, 1.4, 2.4] + >>> (table.search(query, vector_column_name="vector") + ... .where("original_width > 1000", prefilter=True) + ... .select(["caption", "original_width"]) + ... .limit(2) + ... .to_pandas()) + caption original_width vector _distance + 0 foo 2000 [0.5, 3.4, 1.3] 5.220000 + 1 test 3000 [0.3, 6.2, 2.6] 23.089996 + + Parameters + ---------- + query: list/np.ndarray/str/PIL.Image.Image, default None + The targeted vector to search for. + + - *default None*. + Acceptable types are: list, np.ndarray, PIL.Image.Image + + - If None then the select/where/limit clauses are applied to filter + the table + vector_column_name: str + The name of the vector column to search. + *default "vector"* + + Returns + ------- + LanceQueryBuilder + A query builder object representing the query. + Once executed, the query returns + + - selected columns + + - the vector + + - and also the "_distance" column which is the distance between the query + vector and the returned vector. 
+ """ return LanceVectorQueryBuilder(self, query, vector_column_name) def _execute_query(self, query: Query) -> pa.Table: @@ -165,7 +224,51 @@ class RemoteTable(Table): return self._conn._loop.run_until_complete(result).to_arrow() def delete(self, predicate: str): - """Delete rows from the table.""" + """Delete rows from the table. + + This can be used to delete a single row, many rows, all rows, or + sometimes no rows (if your predicate matches nothing). + + Parameters + ---------- + predicate: str + The SQL where clause to use when deleting rows. + + - For example, 'x = 2' or 'x IN (1, 2, 3)'. + + The filter must not be empty, or it will error. + + Examples + -------- + >>> import lancedb + >>> data = [ + ... {"x": 1, "vector": [1, 2]}, + ... {"x": 2, "vector": [3, 4]}, + ... {"x": 3, "vector": [5, 6]} + ... ] + >>> db = lancedb.connect("db://my-test", api_key="sk...") # doctest: +SKIP + >>> table = db.create_table("my_table", data) # doctest: +SKIP + >>> table.search([10,10]).to_pandas() # doctest: +SKIP + x vector _distance # doctest: +SKIP + 0 3 [5.0, 6.0] 41.0 # doctest: +SKIP + 1 2 [3.0, 4.0] 85.0 # doctest: +SKIP + 2 1 [1.0, 2.0] 145.0 # doctest: +SKIP + >>> table.delete("x = 2") # doctest: +SKIP + >>> table.search([10,10]).to_pandas() # doctest: +SKIP + x vector _distance # doctest: +SKIP + 0 3 [5.0, 6.0] 41.0 # doctest: +SKIP + 1 1 [1.0, 2.0] 145.0 # doctest: +SKIP + + If you have a list of values to delete, you can combine them into a + stringified list and use the `IN` operator: + + >>> to_remove = [1, 3] # doctest: +SKIP + >>> to_remove = ", ".join([str(v) for v in to_remove]) # doctest: +SKIP + >>> table.delete(f"x IN ({to_remove})") # doctest: +SKIP + >>> table.search([10,10]).to_pandas() # doctest: +SKIP + x vector _distance # doctest: +SKIP + 0 2 [3.0, 4.0] 85.0 # doctest: +SKIP + """ payload = {"predicate": predicate} self._conn._loop.run_until_complete( self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)