Compare commits

...

2 Commits

Author SHA1 Message Date
qzhu
8e25e0c7f0 reformatted 2023-12-07 12:08:05 -08:00
qzhu
5f989e86d2 SaaS python SDK doc 2023-12-07 12:01:03 -08:00
5 changed files with 262 additions and 43 deletions

View File

@@ -146,7 +146,8 @@ nav:
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md - Serverless Chatbot from any website: examples/serverless_website_chatbot.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- API references: - API references:
- Python API: python/python.md - OSS Python API: python/python.md
- SaaS Python API: python/saas-python.md
- Javascript API: javascript/modules.md - Javascript API: javascript/modules.md
- LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms - LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms

View File

@@ -0,0 +1,18 @@
# LanceDB Python API Reference
## Installation
```shell
pip install lancedb
```
## Connection
::: lancedb.connect
::: lancedb.remote.db.RemoteDBConnection
## Table
::: lancedb.remote.table.RemoteTable

View File

@@ -27,7 +27,7 @@ def connect(
uri: URI, uri: URI,
*, *,
api_key: Optional[str] = None, api_key: Optional[str] = None,
region: str = "us-west-2", region: str = "us-east-1",
host_override: Optional[str] = None, host_override: Optional[str] = None,
) -> DBConnection: ) -> DBConnection:
"""Connect to a LanceDB database. """Connect to a LanceDB database.
@@ -39,7 +39,7 @@ def connect(
api_key: str, optional api_key: str, optional
If presented, connect to LanceDB cloud. If presented, connect to LanceDB cloud.
Otherwise, connect to a database on file system or cloud storage. Otherwise, connect to a database on file system or cloud storage.
region: str, default "us-west-2" region: str, default "us-east-1"
The region to use for LanceDB Cloud. The region to use for LanceDB Cloud.
host_override: str, optional host_override: str, optional
The override url for LanceDB Cloud. The override url for LanceDB Cloud.

View File

@@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection):
return f"RemoveConnect(name={self.db_name})" return f"RemoveConnect(name={self.db_name})"
@override @override
def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]: def table_names(
self, page_token: Optional[str] = None, limit: int = 10
) -> Iterable[str]:
"""List the names of all tables in the database. """List the names of all tables in the database.
Parameters Parameters
---------- ----------
page_token: str page_token: str
The last token to start the new page. The last token to start the new page.
limit: int, default 10
The maximum number of tables to return for each page.
Returns Returns
------- -------
@@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection):
fill_value: float = 0.0, fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None, embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> Table: ) -> Table:
"""Create a [Table][lancedb.table.Table] in the database.
Parameters
----------
name: str
The name of the table.
data: The data to initialize the table, *optional*
User must provide at least one of `data` or `schema`.
Acceptable types are:
- dict or list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
schema: The schema of the table, *optional*
Acceptable types are:
- pyarrow.Schema
- [LanceModel][lancedb.pydantic.LanceModel]
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contain NaNs.
One of "error", "drop", "fill".
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Returns
-------
LanceTable
A reference to the newly created table.
!!! note
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Examples
--------
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("db://test-project-8f45eb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(my_table)
You can also pass a pandas DataFrame:
>>> import pandas as pd
>>> data = pd.DataFrame({
... "vector": [[1.1, 1.2], [0.2, 1.8]],
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(table2)
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(table3)
It is also possible to create a table from `[Iterable[pa.RecordBatch]]`:
>>> import pyarrow as pa
>>> def make_batches():
... for i in range(5):
... yield pa.RecordBatch.from_arrays(
... [
... pa.array([[3.1, 4.1], [5.9, 26.5]],
... pa.list_(pa.float32(), 2)),
... pa.array(["foo", "bar"]),
... pa.array([10.0, 20.0]),
... ],
... ["vector", "item", "price"],
... )
>>> schema=pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(table4)
"""
if data is None and schema is None: if data is None and schema is None:
raise ValueError("Either data or schema must be provided.") raise ValueError("Either data or schema must be provided.")
if embedding_functions is not None: if embedding_functions is not None:

View File

@@ -37,7 +37,10 @@ class RemoteTable(Table):
@cached_property @cached_property
def schema(self) -> pa.Schema: def schema(self) -> pa.Schema:
"""Return the schema of the table.""" """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
of this Table
"""
resp = self._conn._loop.run_until_complete( resp = self._conn._loop.run_until_complete(
self._conn._client.post(f"/v1/table/{self._name}/describe/") self._conn._client.post(f"/v1/table/{self._name}/describe/")
) )
@@ -53,24 +56,17 @@ class RemoteTable(Table):
return resp["version"] return resp["version"]
def to_arrow(self) -> pa.Table: def to_arrow(self) -> pa.Table:
"""Return the table as an Arrow table.""" """to_arrow() is not supported on the LanceDB cloud"""
raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud") raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
def to_pandas(self): def to_pandas(self):
"""Return the table as a Pandas DataFrame. """to_pandas() is not supported on the LanceDB cloud"""
Intercept `to_arrow()` for better error message.
"""
return NotImplementedError("to_pandas() is not supported on the LanceDB cloud") return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
def create_index( def create_index(
self, self,
metric="L2", metric="L2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name: str = VECTOR_COLUMN_NAME, vector_column_name: str = VECTOR_COLUMN_NAME,
replace: bool = True,
accelerator: Optional[str] = None,
index_cache_size: Optional[int] = None, index_cache_size: Optional[int] = None,
): ):
"""Create an index on the table. """Create an index on the table.
@@ -81,39 +77,28 @@ class RemoteTable(Table):
---------- ----------
metric : str metric : str
The metric to use for the index. Default is "L2". The metric to use for the index. Default is "L2".
num_partitions : int
The number of partitions to use for the index. Default is 256.
num_sub_vectors : int
The number of sub-vectors to use for the index. Default is 96.
vector_column_name : str vector_column_name : str
The name of the vector column. Default is "vector". The name of the vector column. Default is "vector".
replace : bool
Whether to replace the existing index. Default is True.
accelerator : str, optional
If set, use the given accelerator to create the index.
Default is None. Currently not supported.
index_cache_size : int, optional
The size of the index cache in number of entries. Default value is 256.
Examples Examples
-------- --------
import lancedb >>> import lancedb
import uuid >>> import uuid
from lancedb.schema import vector >>> from lancedb.schema import vector
conn = lancedb.connect("db://...", api_key="...", region="...") >>> conn = lancedb.connect("db://...", api_key="...", region="...")
table_name = uuid.uuid4().hex >>> table_name = uuid.uuid4().hex
schema = pa.schema( >>> schema = pa.schema(
[ ... [
pa.field("id", pa.uint32(), False), ... pa.field("id", pa.uint32(), False),
pa.field("vector", vector(128), False), ... pa.field("vector", vector(128), False),
pa.field("s", pa.string(), False), ... pa.field("s", pa.string(), False),
] ... ]
) ... )
table = conn.create_table( >>> table = conn.create_table(
table_name, ... table_name,
schema=schema, ... schema=schema,
) ... )
table.create_index() >>> table.create_index("L2", "vector")
""" """
index_type = "vector" index_type = "vector"
@@ -135,6 +120,28 @@ class RemoteTable(Table):
on_bad_vectors: str = "error", on_bad_vectors: str = "error",
fill_value: float = 0.0, fill_value: float = 0.0,
) -> int: ) -> int:
"""Add more data to the [Table](Table). It has the same API signature as the OSS version.
Parameters
----------
data: DATA
The data to insert into the table. Acceptable types are:
- dict or list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
mode: str
The mode to use when writing the data. Valid values are
"append" and "overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contain NaNs.
One of "error", "drop", "fill".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
"""
data = _sanitize_data( data = _sanitize_data(
data, data,
self.schema, self.schema,
@@ -158,6 +165,58 @@ class RemoteTable(Table):
def search( def search(
self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
) -> LanceVectorQueryBuilder: ) -> LanceVectorQueryBuilder:
"""Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search].
All query options are defined in [Query][lancedb.query.Query].
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("db://...", api_key="...", region="...")
>>> data = [
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
... ]
>>> table = db.create_table("my_table", data)
>>> query = [0.4, 1.4, 2.4]
>>> (table.search(query, vector_column_name="vector")
... .where("original_width > 1000", prefilter=True)
... .select(["caption", "original_width"])
... .limit(2)
... .to_pandas())
caption original_width vector _distance
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
1 test 3000 [0.3, 6.2, 2.6] 23.089996
Parameters
----------
query: list/np.ndarray/str/PIL.Image.Image, default None
The targeted vector to search for.
- *default None*.
Acceptable types are: list, np.ndarray, PIL.Image.Image
- If None then the select/where/limit clauses are applied to filter
the table
vector_column_name: str
The name of the vector column to search.
*default "vector"*
Returns
-------
LanceQueryBuilder
A query builder object representing the query.
Once executed, the query returns
- selected columns
- the vector
- and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
return LanceVectorQueryBuilder(self, query, vector_column_name) return LanceVectorQueryBuilder(self, query, vector_column_name)
def _execute_query(self, query: Query) -> pa.Table: def _execute_query(self, query: Query) -> pa.Table:
@@ -165,7 +224,53 @@ class RemoteTable(Table):
return self._conn._loop.run_until_complete(result).to_arrow() return self._conn._loop.run_until_complete(result).to_arrow()
def delete(self, predicate: str): def delete(self, predicate: str):
"""Delete rows from the table.""" """Delete rows from the table.
This can be used to delete a single row, many rows, all rows, or
sometimes no rows (if your predicate matches nothing).
Parameters
----------
predicate: str
The SQL where clause to use when deleting rows.
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
The filter must not be empty, or it will error.
Examples
--------
>>> import lancedb
>>> data = [
... {"x": 1, "vector": [1, 2]},
... {"x": 2, "vector": [3, 4]},
... {"x": 3, "vector": [5, 6]}
... ]
>>> db = lancedb.connect("db://...", api_key="...", region="...")
>>> table = db.create_table("my_table", data)
>>> table.search([10,10]).to_pandas()
x vector _distance
0 3 [5.0, 6.0] 41.0
1 2 [3.0, 4.0] 85.0
2 1 [1.0, 2.0] 145.0
>>> table.delete("x = 2")
>>> table.search([10,10]).to_pandas()
x vector _distance
0 3 [5.0, 6.0] 41.0
1 1 [1.0, 2.0] 145.0
If you have a list of values to delete, you can combine them into a
stringified list and use the `IN` operator:
>>> to_remove = [1, 3]
>>> to_remove = ", ".join([str(v) for v in to_remove])
>>> to_remove
'1, 3'
>>> table.delete(f"x IN ({to_remove})")
>>> table.search([10,10]).to_pandas()
x vector _distance
0 2 [3.0, 4.0] 85.0
"""
payload = {"predicate": predicate} payload = {"predicate": predicate}
self._conn._loop.run_until_complete( self._conn._loop.run_until_complete(
self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload) self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)