mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00
Compare commits
2 Commits
python-v0.
...
qian@saas-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e25e0c7f0 | ||
|
|
5f989e86d2 |
@@ -146,7 +146,8 @@ nav:
|
||||
- Serverless Chatbot from any website: examples/serverless_website_chatbot.md
|
||||
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
|
||||
- API references:
|
||||
- Python API: python/python.md
|
||||
- OSS Python API: python/python.md
|
||||
- SaaS Python API: python/saas-python.md
|
||||
- Javascript API: javascript/modules.md
|
||||
- LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms
|
||||
|
||||
|
||||
18
docs/src/python/saas-python.md
Normal file
18
docs/src/python/saas-python.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# LanceDB Python API Reference
|
||||
|
||||
## Installation
|
||||
|
||||
```shell
|
||||
pip install lancedb
|
||||
```
|
||||
|
||||
## Connection
|
||||
|
||||
::: lancedb.connect
|
||||
|
||||
::: lancedb.remote.db.RemoteDBConnection
|
||||
|
||||
## Table
|
||||
|
||||
::: lancedb.remote.table.RemoteTable
|
||||
|
||||
@@ -27,7 +27,7 @@ def connect(
|
||||
uri: URI,
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
region: str = "us-west-2",
|
||||
region: str = "us-east-1",
|
||||
host_override: Optional[str] = None,
|
||||
) -> DBConnection:
|
||||
"""Connect to a LanceDB database.
|
||||
@@ -39,7 +39,7 @@ def connect(
|
||||
api_key: str, optional
|
||||
If presented, connect to LanceDB cloud.
|
||||
Otherwise, connect to a database on file system or cloud storage.
|
||||
region: str, default "us-west-2"
|
||||
region: str, default "us-east-1"
|
||||
The region to use for LanceDB Cloud.
|
||||
host_override: str, optional
|
||||
The override url for LanceDB Cloud.
|
||||
|
||||
@@ -59,13 +59,17 @@ class RemoteDBConnection(DBConnection):
|
||||
return f"RemoveConnect(name={self.db_name})"
|
||||
|
||||
@override
|
||||
def table_names(self, page_token: Optional[str] = None, limit=10) -> Iterable[str]:
|
||||
def table_names(
|
||||
self, page_token: Optional[str] = None, limit: int = 10
|
||||
) -> Iterable[str]:
|
||||
"""List the names of all tables in the database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
page_token: str
|
||||
The last token to start the new page.
|
||||
limit: int, default 10
|
||||
The maximum number of tables to return for each page.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -120,6 +124,97 @@ class RemoteDBConnection(DBConnection):
|
||||
fill_value: float = 0.0,
|
||||
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
|
||||
) -> Table:
|
||||
"""Create a [Table][lancedb.table.Table] in the database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
The name of the table.
|
||||
data: The data to initialize the table, *optional*
|
||||
User must provide at least one of `data` or `schema`.
|
||||
Acceptable types are:
|
||||
|
||||
- dict or list-of-dict
|
||||
|
||||
- pandas.DataFrame
|
||||
|
||||
- pyarrow.Table or pyarrow.RecordBatch
|
||||
schema: The schema of the table, *optional*
|
||||
Acceptable types are:
|
||||
|
||||
- pyarrow.Schema
|
||||
|
||||
- [LanceModel][lancedb.pydantic.LanceModel]
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contain NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
fill_value: float
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceTable
|
||||
A reference to the newly created table.
|
||||
|
||||
!!! note
|
||||
|
||||
The vector index won't be created by default.
|
||||
To create the index, call the `create_index` method on the table.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
Can create with list of tuples or dictionaries:
|
||||
|
||||
>>> import lancedb
|
||||
>>> db = lancedb.connect("db://test-project-8f45eb")
|
||||
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
|
||||
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
|
||||
>>> db.create_table("my_table", data)
|
||||
LanceTable(my_table)
|
||||
|
||||
You can also pass a pandas DataFrame:
|
||||
|
||||
>>> import pandas as pd
|
||||
>>> data = pd.DataFrame({
|
||||
... "vector": [[1.1, 1.2], [0.2, 1.8]],
|
||||
... "lat": [45.5, 40.1],
|
||||
... "long": [-122.7, -74.1]
|
||||
... })
|
||||
>>> db.create_table("table2", data)
|
||||
LanceTable(table2)
|
||||
|
||||
>>> custom_schema = pa.schema([
|
||||
... pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
... pa.field("lat", pa.float32()),
|
||||
... pa.field("long", pa.float32())
|
||||
... ])
|
||||
>>> db.create_table("table3", data, schema = custom_schema)
|
||||
LanceTable(table3)
|
||||
|
||||
It is also possible to create a table from `[Iterable[pa.RecordBatch]]`:
|
||||
|
||||
>>> import pyarrow as pa
|
||||
>>> def make_batches():
|
||||
... for i in range(5):
|
||||
... yield pa.RecordBatch.from_arrays(
|
||||
... [
|
||||
... pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||
... pa.list_(pa.float32(), 2)),
|
||||
... pa.array(["foo", "bar"]),
|
||||
... pa.array([10.0, 20.0]),
|
||||
... ],
|
||||
... ["vector", "item", "price"],
|
||||
... )
|
||||
>>> schema=pa.schema([
|
||||
... pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
... pa.field("item", pa.utf8()),
|
||||
... pa.field("price", pa.float32()),
|
||||
... ])
|
||||
>>> db.create_table("table4", make_batches(), schema=schema)
|
||||
LanceTable(table4)
|
||||
|
||||
"""
|
||||
if data is None and schema is None:
|
||||
raise ValueError("Either data or schema must be provided.")
|
||||
if embedding_functions is not None:
|
||||
|
||||
@@ -37,7 +37,10 @@ class RemoteTable(Table):
|
||||
|
||||
@cached_property
|
||||
def schema(self) -> pa.Schema:
|
||||
"""Return the schema of the table."""
|
||||
"""The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
|
||||
of this Table
|
||||
|
||||
"""
|
||||
resp = self._conn._loop.run_until_complete(
|
||||
self._conn._client.post(f"/v1/table/{self._name}/describe/")
|
||||
)
|
||||
@@ -53,24 +56,17 @@ class RemoteTable(Table):
|
||||
return resp["version"]
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
"""Return the table as an Arrow table."""
|
||||
"""to_arrow() is not supported on the LanceDB cloud"""
|
||||
raise NotImplementedError("to_arrow() is not supported on the LanceDB cloud")
|
||||
|
||||
def to_pandas(self):
|
||||
"""Return the table as a Pandas DataFrame.
|
||||
|
||||
Intercept `to_arrow()` for better error message.
|
||||
"""
|
||||
"""to_pandas() is not supported on the LanceDB cloud"""
|
||||
return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
|
||||
|
||||
def create_index(
|
||||
self,
|
||||
metric="L2",
|
||||
num_partitions=256,
|
||||
num_sub_vectors=96,
|
||||
vector_column_name: str = VECTOR_COLUMN_NAME,
|
||||
replace: bool = True,
|
||||
accelerator: Optional[str] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
):
|
||||
"""Create an index on the table.
|
||||
@@ -81,39 +77,28 @@ class RemoteTable(Table):
|
||||
----------
|
||||
metric : str
|
||||
The metric to use for the index. Default is "L2".
|
||||
num_partitions : int
|
||||
The number of partitions to use for the index. Default is 256.
|
||||
num_sub_vectors : int
|
||||
The number of sub-vectors to use for the index. Default is 96.
|
||||
vector_column_name : str
|
||||
The name of the vector column. Default is "vector".
|
||||
replace : bool
|
||||
Whether to replace the existing index. Default is True.
|
||||
accelerator : str, optional
|
||||
If set, use the given accelerator to create the index.
|
||||
Default is None. Currently not supported.
|
||||
index_cache_size : int, optional
|
||||
The size of the index cache in number of entries. Default value is 256.
|
||||
|
||||
Examples
|
||||
--------
|
||||
import lancedb
|
||||
import uuid
|
||||
from lancedb.schema import vector
|
||||
conn = lancedb.connect("db://...", api_key="...", region="...")
|
||||
table_name = uuid.uuid4().hex
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("id", pa.uint32(), False),
|
||||
pa.field("vector", vector(128), False),
|
||||
pa.field("s", pa.string(), False),
|
||||
]
|
||||
)
|
||||
table = conn.create_table(
|
||||
table_name,
|
||||
schema=schema,
|
||||
)
|
||||
table.create_index()
|
||||
>>> import lancedb
|
||||
>>> import uuid
|
||||
>>> from lancedb.schema import vector
|
||||
>>> conn = lancedb.connect("db://...", api_key="...", region="...")
|
||||
>>> table_name = uuid.uuid4().hex
|
||||
>>> schema = pa.schema(
|
||||
... [
|
||||
... pa.field("id", pa.uint32(), False),
|
||||
... pa.field("vector", vector(128), False),
|
||||
... pa.field("s", pa.string(), False),
|
||||
... ]
|
||||
... )
|
||||
>>> table = conn.create_table(
|
||||
... table_name,
|
||||
... schema=schema,
|
||||
... )
|
||||
>>> table.create_index("L2", "vector")
|
||||
"""
|
||||
index_type = "vector"
|
||||
|
||||
@@ -135,6 +120,28 @@ class RemoteTable(Table):
|
||||
on_bad_vectors: str = "error",
|
||||
fill_value: float = 0.0,
|
||||
) -> int:
|
||||
"""Add more data to the [Table][lancedb.table.Table]. It has the same API signature as the OSS version.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data: DATA
|
||||
The data to insert into the table. Acceptable types are:
|
||||
|
||||
- dict or list-of-dict
|
||||
|
||||
- pandas.DataFrame
|
||||
|
||||
- pyarrow.Table or pyarrow.RecordBatch
|
||||
mode: str
|
||||
The mode to use when writing the data. Valid values are
|
||||
"append" and "overwrite".
|
||||
on_bad_vectors: str, default "error"
|
||||
What to do if any of the vectors are not the same size or contain NaNs.
|
||||
One of "error", "drop", "fill".
|
||||
fill_value: float, default 0.
|
||||
The value to use when filling vectors. Only used if on_bad_vectors="fill".
|
||||
|
||||
"""
|
||||
data = _sanitize_data(
|
||||
data,
|
||||
self.schema,
|
||||
@@ -158,6 +165,58 @@ class RemoteTable(Table):
|
||||
def search(
|
||||
self, query: Union[VEC, str], vector_column_name: str = VECTOR_COLUMN_NAME
|
||||
) -> LanceVectorQueryBuilder:
|
||||
"""Create a search query to find the nearest neighbors
|
||||
of the given query vector. We currently support [vector search][search].
|
||||
|
||||
All query options are defined in [Query][lancedb.query.Query].
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
>>> db = lancedb.connect("db://...", api_key="...", region="...")
|
||||
>>> data = [
|
||||
... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
|
||||
... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
|
||||
... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
|
||||
... ]
|
||||
>>> table = db.create_table("my_table", data)
|
||||
>>> query = [0.4, 1.4, 2.4]
|
||||
>>> (table.search(query, vector_column_name="vector")
|
||||
... .where("original_width > 1000", prefilter=True)
|
||||
... .select(["caption", "original_width"])
|
||||
... .limit(2)
|
||||
... .to_pandas())
|
||||
caption original_width vector _distance
|
||||
0 foo 2000 [0.5, 3.4, 1.3] 5.220000
|
||||
1 test 3000 [0.3, 6.2, 2.6] 23.089996
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query: list/np.ndarray/str/PIL.Image.Image, default None
|
||||
The targeted vector to search for.
|
||||
|
||||
- *default None*.
|
||||
Acceptable types are: list, np.ndarray, PIL.Image.Image
|
||||
|
||||
- If None then the select/where/limit clauses are applied to filter
|
||||
the table
|
||||
vector_column_name: str
|
||||
The name of the vector column to search.
|
||||
*default "vector"*
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceQueryBuilder
|
||||
A query builder object representing the query.
|
||||
Once executed, the query returns
|
||||
|
||||
- selected columns
|
||||
|
||||
- the vector
|
||||
|
||||
- and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
return LanceVectorQueryBuilder(self, query, vector_column_name)
|
||||
|
||||
def _execute_query(self, query: Query) -> pa.Table:
|
||||
@@ -165,7 +224,53 @@ class RemoteTable(Table):
|
||||
return self._conn._loop.run_until_complete(result).to_arrow()
|
||||
|
||||
def delete(self, predicate: str):
|
||||
"""Delete rows from the table."""
|
||||
"""Delete rows from the table.
|
||||
|
||||
This can be used to delete a single row, many rows, all rows, or
|
||||
sometimes no rows (if your predicate matches nothing).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
predicate: str
|
||||
The SQL where clause to use when deleting rows.
|
||||
|
||||
- For example, 'x = 2' or 'x IN (1, 2, 3)'.
|
||||
|
||||
The filter must not be empty, or it will error.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
>>> data = [
|
||||
... {"x": 1, "vector": [1, 2]},
|
||||
... {"x": 2, "vector": [3, 4]},
|
||||
... {"x": 3, "vector": [5, 6]}
|
||||
... ]
|
||||
>>> db = lancedb.connect("db://...", api_key="...", region="...")
|
||||
>>> table = db.create_table("my_table", data)
|
||||
>>> table.search([10,10]).to_pandas()
|
||||
x vector _distance
|
||||
0 3 [5.0, 6.0] 41.0
|
||||
1 2 [3.0, 4.0] 85.0
|
||||
2 1 [1.0, 2.0] 145.0
|
||||
>>> table.delete("x = 2")
|
||||
>>> table.search([10,10]).to_pandas()
|
||||
x vector _distance
|
||||
0 3 [5.0, 6.0] 41.0
|
||||
1 1 [1.0, 2.0] 145.0
|
||||
|
||||
If you have a list of values to delete, you can combine them into a
|
||||
stringified list and use the `IN` operator:
|
||||
|
||||
>>> to_remove = [1, 3]
|
||||
>>> to_remove = ", ".join([str(v) for v in to_remove])
|
||||
>>> to_remove
|
||||
'1, 3'
|
||||
>>> table.delete(f"x IN ({to_remove})")
|
||||
>>> table.search([10,10]).to_pandas()
|
||||
x vector _distance
|
||||
0 2 [3.0, 4.0] 85.0
|
||||
"""
|
||||
payload = {"predicate": predicate}
|
||||
self._conn._loop.run_until_complete(
|
||||
self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
|
||||
|
||||
Reference in New Issue
Block a user