Compare commits

...

2 Commits

Author SHA1 Message Date
qzhu
8e25e0c7f0 reformatted 2023-12-07 12:08:05 -08:00
qzhu
5f989e86d2 SaaS python SDK doc 2023-12-07 12:01:03 -08:00
5 changed files with 262 additions and 43 deletions

View File

@@ -146,7 +146,8 @@ nav:
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- API references:
- Python API: python/python.md
- OSS Python API: python/python.md
- SaaS Python API: python/saas-python.md
- Javascript API: javascript/modules.md
- LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms

View File

@@ -0,0 +1,18 @@
# LanceDB Python API Reference
## Installation
```shell
pip install lancedb
```
## Connection
::: lancedb.connect
::: lancedb.remote.db.RemoteDBConnection
## Table
::: lancedb.remote.table.RemoteTable

View File

@@ -27,7 +27,7 @@ def connect(
uri: URI,
*,
api_key: Optional[str] = None,
region: str = "us-west-2",
region: str = "us-east-1",
host_override: Optional[str] = None,
) -> DBConnection:
"""Connect to a LanceDB database.
@@ -39,7 +39,7 @@ def connect(
api_key: str, optional
If presented, connect to LanceDB cloud.
Otherwise, connect to a database on file system or cloud storage.
region: str, default "us-west-2"
region: str, default "us-east-1"
The region to use for LanceDB Cloud.
host_override: str, optional
The override url for LanceDB Cloud.

View File

@@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection):
return f"RemoveConnect(name={self.db_name})"
@override
def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]:
def table_names(
self, page_token: Optional[str] = None, limit: int = 10
) -> Iterable[str]:
"""List the names of all tables in the database.
Parameters
----------
page_token: str
The last token to start the new page.
limit: int, default 10
The maximum number of tables to return for each page.
Returns
-------
@@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection):
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> Table:
"""Create a [Table][lancedb.table.Table] in the database.
Parameters
----------
name: str
The name of the table.
data: The data to initialize the table, *optional*
User must provide at least one of `data` or `schema`.
Acceptable types are:
- dict or list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
schema: The schema of the table, *optional*
Acceptable types are:
- pyarrow.Schema
- [LanceModel][lancedb.pydantic.LanceModel]
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Returns
-------
LanceTable
A reference to the newly created table.
!!! note
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Examples
--------
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("db://test-project-8f45eb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(my_table)
You can also pass a pandas DataFrame:
>>> import pandas as pd
>>> data = pd.DataFrame({
... "vector": [[1.1, 1.2], [0.2, 1.8]],
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(table2)
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(table3)
It is also possible to create an table from `[Iterable[pa.RecordBatch]]`:
>>> import pyarrow as pa
>>> def make_batches():
... for i in range(5):
... yield pa.RecordBatch.from_arrays(
... [
... pa.array([[3.1, 4.1], [5.9, 26.5]],
... pa.list_(pa.float32(), 2)),
... pa.array(["foo", "bar"]),
... pa.array([10.0, 20.0]),
... ],
... ["vector", "item", "price"],
... )
>>> schema=pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(table4)
"""
if data is None and schema is None:
raise ValueError("Either data or schema must be provided.")
if embedding_functions is not None:

View File

@@ -37,7 +37,10 @@ class RemoteTable(Table):
@cached_property
def schema(self) -> pa.Schema:
"""Return the schema of the table."""
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
of this Table
"""
resp = self._conn._loop.run_until_complete(
self._conn._client.post(f"/v1/table/{self._name}/describe/")
)
@@ -53,24 +56,17 @@ class RemoteTable(Table):
return resp["version"]
def to_arrow(self) -> pa.Table:
"""Return the table as an Arrow table."""
"""to_arrow() is not supported on the LanceDB cloud"""
raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
def to_pandas(self):
"""Return the table as a Pandas DataFrame.
Intercept `to_arrow()` for better error message.
"""
"""to_pandas() is not supported on the LanceDB cloud"""
return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
def create_index(
self,
metric="L2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name: str = VECTOR_COLUMN_NAME,
replace: bool = True,
accelerator: Optional[str] = None,
index_cache_size: Optional[int] = None,
):
"""Create an index on the table.
@@ -81,39 +77,28 @@ class RemoteTable(Table):
----------
metric : str
The metric to use for the index. Default is "L2".
num_partitions : int
The number of partitions to use for the index. Default is 256.
num_sub_vectors : int
The number of sub-vectors to use for the index. Default is 96.
vector_column_name : str
The name of the vector column. Default is "vector".
replace : bool
Whether to replace the existing index. Default is True.
accelerator : str, optional
If set, use the given accelerator to create the index.
Default is None. Currently not supported.
index_cache_size : int, optional
The size of the index cache in number of entries. Default value is 256.
Examples
--------
import lancedb
import uuid
from lancedb.schema import vector
conn = lancedb.connect("db://...", api_key="...", region="...")
table_name = uuid.uuid4().hex
schema = pa.schema(
[
pa.field("id", pa.uint32(), False),
pa.field("vector", vector(128), False),
pa.field("s", pa.string(), False),
]
)
table = conn.create_table(
table_name,
schema=schema,
)
table.create_index()
>>> import lancedb
>>> import uuid
>>> from lancedb.schema import vector
>>> conn = lancedb.connect("db://...", api_key="...", region="...")
>>> table_name = uuid.uuid4().hex
>>> schema = pa.schema(
... [
... pa.field("id", pa.uint32(), False),
... pa.field("vector", vector(128), False),
... pa.field("s", pa.string(), False),
... ]
... )
>>> table = conn.create_table(
>>> table_name,
>>> schema=schema,
>>> )
>>> table.create_index("L2", "vector")
"""
index_type = "vector"
@@ -135,6 +120,28 @@ class RemoteTable(Table):
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> int:
"""Add more data to the [Table](Table). It has the same API signature as the OSS version.
Parameters
----------
data: DATA
The data to insert into the table. Acceptable types are:
- dict or list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
mode: str
The mode to use when writing the data. Valid values are
"append" and "overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
fill_value: float, default 0.
The value to use when filling vectors. Only used if on_bad_vectors="fill".
"""
data = _sanitize_data(
data,
self.schema,
@@ -158,6 +165,58 @@ class RemoteTable(Table):
def search(
self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
) -> LanceVectorQueryBuilder:
"""Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search]
All query options are defined in [Query][lancedb.query.Query].
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("db://...", api_key="...", region="...")
>>> data = [
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
... ]
>>> table = db.create_table("my_table", data)
>>> query = [0.4, 1.4, 2.4]
>>> (table.search(query, vector_column_name="vector")
... .where("original_width > 1000", prefilter=True)
... .select(["caption", "original_width"])
... .limit(2)
... .to_pandas())
caption original_width vector _distance
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
1 test 3000 [0.3, 6.2, 2.6] 23.089996
Parameters
----------
query: list/np.ndarray/str/PIL.Image.Image, default None
The targetted vector to search for.
- *default None*.
Acceptable types are: list, np.ndarray, PIL.Image.Image
- If None then the select/where/limit clauses are applied to filter
the table
vector_column_name: str
The name of the vector column to search.
*default "vector"*
Returns
-------
LanceQueryBuilder
A query builder object representing the query.
Once executed, the query returns
- selected columns
- the vector
- and also the "_distance" column which is the distance between the query
vector and the returned vector.
"""
return LanceVectorQueryBuilder(self, query, vector_column_name)
def _execute_query(self, query: Query) -> pa.Table:
@@ -165,7 +224,53 @@ class RemoteTable(Table):
return self._conn._loop.run_until_complete(result).to_arrow()
def delete(self, predicate: str):
"""Delete rows from the table."""
"""Delete rows from the table.
This can be used to delete a single row, many rows, all rows, or
sometimes no rows (if your predicate matches nothing).
Parameters
----------
predicate: str
The SQL where clause to use when deleting rows.
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
The filter must not be empty, or it will error.
Examples
--------
>>> import lancedb
>>> data = [
... {"x": 1, "vector": [1, 2]},
... {"x": 2, "vector": [3, 4]},
... {"x": 3, "vector": [5, 6]}
... ]
>>> db = lancedb.connect("db://...", api_key="...", region="...")
>>> table = db.create_table("my_table", data)
>>> table.search([10,10]).to_pandas()
x vector _distance
0 3 [5.0, 6.0] 41.0
1 2 [3.0, 4.0] 85.0
2 1 [1.0, 2.0] 145.0
>>> table.delete("x = 2")
>>> table.search([10,10]).to_pandas()
x vector _distance
0 3 [5.0, 6.0] 41.0
1 1 [1.0, 2.0] 145.0
If you have a list of values to delete, you can combine them into a
stringified list and use the `IN` operator:
>>> to_remove = [1, 3]
>>> to_remove = ", ".join([str(v) for v in to_remove])
>>> to_remove
'1, 3'
>>> table.delete(f"x IN ({to_remove})")
>>> table.search([10,10]).to_pandas()
x vector _distance
0 2 [3.0, 4.0] 85.0
"""
payload = {"predicate": predicate}
self._conn._loop.run_until_complete(
self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)