feat: port create_table to the async python API and the remote rust API (#1031)
I've also started `ASYNC_MIGRATION.md` to keep track of the breaking changes from the sync to the async Python API.
python/ASYNC_MIGRATION.md (new file, 24 lines)
@@ -0,0 +1,24 @@
# Migration from Sync to Async API

A new asynchronous API has been added to LanceDB. This API is built
on top of the Rust lancedb crate (instead of being built on top of
pylance). This will help keep the various language bindings in sync.
There are some slight changes between the synchronous and the asynchronous
APIs. This document will help you migrate. These changes relate mostly
to the Connection and Table classes.

## Almost all functions are async

The most important change is that almost all functions are now async.
This means the functions now return `asyncio` coroutines. You will
need to use `await` to call these functions.

## Connection

No changes yet.

## Table

* Previously `Table.schema` was a property. Now it is an async method.
* The method `Table.__len__` was removed and `len(table)` will no longer
  work. Use `Table.count_rows` instead.
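Illustrative only (not part of the diff): a minimal sketch of the new calling convention, using the `connect_async` entry point exercised by the tests further down.

    import asyncio

    import lancedb

    async def main():
        db = await lancedb.connect_async("./.lancedb")
        table = await db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}])
        # `schema` is now an async method, not a property
        print(await table.schema())

    asyncio.run(main())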
@@ -14,6 +14,7 @@ name = "_lancedb"
crate-type = ["cdylib"]

[dependencies]
arrow = { version = "50.0.0", features = ["pyarrow"] }
lancedb = { path = "../rust/lancedb" }
env_logger = "0.10"
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
@@ -23,4 +24,7 @@ pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
lzma-sys = { version = "*", features = ["static"] }

[build-dependencies]
pyo3-build-config = { version = "0.20.3", features = ["extension-module", "abi3-py38"] }
pyo3-build-config = { version = "0.20.3", features = [
    "extension-module",
    "abi3-py38",
] }
@@ -1,7 +1,19 @@
from typing import Optional

import pyarrow as pa

class Connection(object):
    async def table_names(self) -> list[str]: ...
    async def create_table(
        self, name: str, mode: str, data: pa.RecordBatchReader
    ) -> Table: ...
    async def create_empty_table(
        self, name: str, mode: str, schema: pa.Schema
    ) -> Table: ...

class Table(object):
    def name(self) -> str: ...
    async def schema(self) -> pa.Schema: ...

async def connect(
    uri: str,
@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Iterable, List, Union
from typing import Iterable, List, Optional, Union

import numpy as np
import pyarrow as pa
@@ -38,3 +38,99 @@ class Credential(str):

def sanitize_uri(uri: URI) -> str:
    return str(uri)


def _casting_recordbatch_iter(
    input_iter: Iterable[pa.RecordBatch], schema: pa.Schema
) -> Iterable[pa.RecordBatch]:
    """
    Wrapper around an iterator of record batches. If the batches don't match the
    schema, try to cast them to the schema. If that fails, raise an error.

    This is helpful for users who might have written the iterator with default
    data types in PyArrow, but specified more specific types in the schema. For
    example, PyArrow defaults to float64 for floating point types, but Lance
    uses float32 for vectors.
    """
    for batch in input_iter:
        if not isinstance(batch, pa.RecordBatch):
            raise TypeError(f"Expected RecordBatch, got {type(batch)}")
        if batch.schema != schema:
            try:
                # RecordBatch doesn't have a cast method, but table does.
                batch = pa.Table.from_batches([batch]).cast(schema).to_batches()[0]
            except pa.lib.ArrowInvalid:
                raise ValueError(
                    f"Input RecordBatch iterator yielded a batch with schema that "
                    f"does not match the expected schema.\nExpected:\n{schema}\n"
                    f"Got:\n{batch.schema}"
                )
        yield batch


def data_to_reader(
    data: DATA, schema: Optional[pa.Schema] = None
) -> pa.RecordBatchReader:
    """Convert various types of input into a RecordBatchReader"""
    if pd is not None and isinstance(data, pd.DataFrame):
        return pa.Table.from_pandas(data, schema=schema).to_reader()
    elif isinstance(data, pa.Table):
        return data.to_reader()
    elif isinstance(data, pa.RecordBatch):
        return pa.Table.from_batches([data]).to_reader()
    # elif isinstance(data, LanceDataset):
    #     return data_obj.scanner().to_reader()
    elif isinstance(data, pa.dataset.Dataset):
        return pa.dataset.Scanner.from_dataset(data).to_reader()
    elif isinstance(data, pa.dataset.Scanner):
        return data.to_reader()
    elif isinstance(data, pa.RecordBatchReader):
        return data
    elif (
        type(data).__module__.startswith("polars")
        and data.__class__.__name__ == "DataFrame"
    ):
        return data.to_arrow().to_reader()
    # for other iterables, assume they are of type Iterable[RecordBatch]
    elif isinstance(data, Iterable):
        if schema is not None:
            data = _casting_recordbatch_iter(data, schema)
            return pa.RecordBatchReader.from_batches(schema, data)
        else:
            raise ValueError(
                "Must provide schema to write dataset from RecordBatch iterable"
            )
    else:
        raise TypeError(
            f"Unknown data type {type(data)}. "
            "Please check "
            "https://lancedb.github.io/lance/read_and_write.html "
            "to see supported types."
        )


def validate_schema(schema: pa.Schema):
    """
    Make sure the metadata is valid utf8
    """
    if schema.metadata is not None:
        _validate_metadata(schema.metadata)


def _validate_metadata(metadata: dict):
    """
    Make sure the metadata values are valid utf8 (can be nested)

    Raises ValueError if not valid utf8
    """
    for k, v in metadata.items():
        if isinstance(v, bytes):
            try:
                v.decode("utf8")
            except UnicodeDecodeError:
                raise ValueError(
                    f"Metadata key {k} is not valid utf8. "
                    "Consider base64 encode for generic binary metadata."
                )
        elif isinstance(v, dict):
            _validate_metadata(v)
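A quick sketch (illustrative, not part of the diff) of the casting behavior these helpers provide: an iterator that yields PyArrow's default float64 batches can still be written against a float32 schema.

    import pyarrow as pa

    # Hypothetical standalone usage; in this commit data_to_reader is called
    # from AsyncLanceDBConnection.create_table.
    from lancedb.common import data_to_reader

    schema = pa.schema([pa.field("vector", pa.list_(pa.float32()))])

    def batches():
        # PyArrow infers list<double> here; _casting_recordbatch_iter casts
        # each batch to the declared list<float> schema as it is consumed.
        yield pa.RecordBatch.from_pydict({"vector": [[1.1, 1.2]]})

    reader = data_to_reader(batches(), schema)
    table = reader.read_all()
    assert table.schema == schema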
@@ -13,6 +13,7 @@

from __future__ import annotations

import inspect
import os
from abc import abstractmethod
from pathlib import Path
@@ -22,7 +23,12 @@ import pyarrow as pa
from overrides import EnforceOverrides, override
from pyarrow import fs

from .table import LanceTable, Table
from lancedb.common import data_to_reader, validate_schema
from lancedb.embeddings.registry import EmbeddingFunctionRegistry
from lancedb.utils.events import register_event

from .pydantic import LanceModel
from .table import AsyncLanceTable, LanceTable, Table, _sanitize_data
from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri

if TYPE_CHECKING:
@@ -31,7 +37,6 @@ if TYPE_CHECKING:
    from ._lancedb import Connection as LanceDbConnection
    from .common import DATA, URI
    from .embeddings import EmbeddingFunctionConfig
    from .pydantic import LanceModel

class DBConnection(EnforceOverrides):
@@ -644,6 +649,7 @@ class AsyncLanceDBConnection(AsyncConnection):
        page_token=None,
        limit=None,
    ) -> Iterable[str]:
        # TODO: hook in page_token and limit
        return await self._inner.table_names()

    @override
@@ -657,8 +663,66 @@ class AsyncLanceDBConnection(AsyncConnection):
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
        embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
    ) -> LanceTable:
        raise NotImplementedError
    ) -> Table:
        if mode.lower() not in ["create", "overwrite"]:
            raise ValueError("mode must be either 'create' or 'overwrite'")

        if inspect.isclass(schema) and issubclass(schema, LanceModel):
            # convert LanceModel to pyarrow schema
            # note that it's possible this contains
            # embedding function metadata already
            schema = schema.to_arrow_schema()

        metadata = None
        if embedding_functions is not None:
            # If we passed in embedding functions explicitly
            # then we'll override any schema metadata that
            # may have been implicitly specified by the LanceModel schema
            registry = EmbeddingFunctionRegistry.get_instance()
            metadata = registry.get_table_metadata(embedding_functions)

        if data is not None:
            data = _sanitize_data(
                data,
                schema,
                metadata=metadata,
                on_bad_vectors=on_bad_vectors,
                fill_value=fill_value,
            )

        if schema is None:
            if data is None:
                raise ValueError("Either data or schema must be provided")
            elif hasattr(data, "schema"):
                schema = data.schema
            elif isinstance(data, Iterable):
                if metadata:
                    raise TypeError(
                        "Persistent embedding functions not yet "
                        "supported for generator data input"
                    )

        if metadata:
            schema = schema.with_metadata(metadata)
        validate_schema(schema)

        if mode == "create" and exist_ok:
            mode = "exist_ok"

        if data is None:
            new_table = await self._inner.create_empty_table(name, mode, schema)
        else:
            data = data_to_reader(data, schema)
            new_table = await self._inner.create_table(
                name,
                mode,
                data,
            )

        register_event("create_table")
        return AsyncLanceTable(new_table)

    @override
    async def open_table(self, name: str) -> LanceTable:
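The new path end to end, as a hedged sketch (it mirrors the tests below; `connect_async` and the pandas/pyarrow inputs are taken from them, the path and table names are illustrative):

    import asyncio

    import pandas as pd
    import pyarrow as pa

    import lancedb

    async def main():
        db = await lancedb.connect_async("/tmp/lancedb-demo")
        # create from data: data_to_reader turns the DataFrame into a
        # RecordBatchReader that is streamed into the Rust create_table
        tbl = await db.create_table(
            "demo", data=pd.DataFrame({"vector": [[3.1, 4.1]], "item": ["foo"]})
        )
        # create empty from a schema: goes through create_empty_table instead
        schema = pa.schema([pa.field("x", pa.int64())])
        empty = await db.create_table("empty_demo", schema=schema)
        print(await tbl.schema(), await empty.schema())

    asyncio.run(main())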
@@ -28,6 +28,7 @@ import pyarrow.compute as pc
import pyarrow.fs as pa_fs
from lance import LanceDataset
from lance.vector import vec_to_table
from overrides import override

from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
    import PIL
    from lance.dataset import CleanupStats, ReaderLike

    from ._lancedb import Table as LanceDBTable
    from .db import LanceDBConnection

@@ -1780,3 +1782,715 @@ def _sanitize_nans(data, fill_value, on_bad_vectors, vec_arr, vector_column_name
        is_full = np.any(~is_value_nan.reshape(-1, vec_arr.type.list_size), axis=1)
        data = data.filter(is_full)
    return data


class AsyncTable(ABC):
    """
    A Table is a collection of Records in a LanceDB Database.

    Examples
    --------

    Create using [DBConnection.create_table][lancedb.DBConnection.create_table]
    (more examples in that method's documentation).

    >>> import lancedb
    >>> db = lancedb.connect("./.lancedb")
    >>> table = db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}])
    >>> table.head()
    pyarrow.Table
    vector: fixed_size_list<item: float>[2]
      child 0, item: float
    b: int64
    ----
    vector: [[[1.1,1.2]]]
    b: [[2]]

    Can append new data with [Table.add()][lancedb.table.Table.add].

    >>> table.add([{"vector": [0.5, 1.3], "b": 4}])

    Can query the table with [Table.search][lancedb.table.Table.search].

    >>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
       b      vector  _distance
    0  4  [0.5, 1.3]       0.82
    1  2  [1.1, 1.2]       1.13

    Search queries are much faster when an index is created. See
    [Table.create_index][lancedb.table.Table.create_index].
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """The name of the table."""
        raise NotImplementedError

    @abstractmethod
    async def schema(self) -> pa.Schema:
        """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#)
        of this Table
        """
        raise NotImplementedError

    @abstractmethod
    async def count_rows(self, filter: Optional[str] = None) -> int:
        """
        Count the number of rows in the table.

        Parameters
        ----------
        filter: str, optional
            A SQL where clause to filter the rows to count.
        """
        raise NotImplementedError

    async def to_pandas(self) -> "pd.DataFrame":
        """Return the table as a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
        """
        # to_arrow is async, so it must be awaited before converting
        return (await self.to_arrow()).to_pandas()

    @abstractmethod
    async def to_arrow(self) -> pa.Table:
        """Return the table as a pyarrow Table.

        Returns
        -------
        pa.Table
        """
        raise NotImplementedError

    async def create_index(
        self,
        metric="L2",
        num_partitions=256,
        num_sub_vectors=96,
        vector_column_name: str = VECTOR_COLUMN_NAME,
        replace: bool = True,
        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
    ):
        """Create an index on the table.

        Parameters
        ----------
        metric: str, default "L2"
            The distance metric to use when creating the index.
            Valid values are "L2", "cosine", or "dot".
            L2 is euclidean distance.
        num_partitions: int, default 256
            The number of IVF partitions to use when creating the index.
            Default is 256.
        num_sub_vectors: int, default 96
            The number of PQ sub-vectors to use when creating the index.
            Default is 96.
        vector_column_name: str, default "vector"
            The vector column name to create the index.
        replace: bool, default True
            - If True, replace the existing index if it exists.

            - If False, raise an error if a duplicate index exists.
        accelerator: str, default None
            If set, use the given accelerator to create the index.
            Only "cuda" is supported for now.
        index_cache_size : int, optional
            The size of the index cache in number of entries. Default value is 256.
        """
        raise NotImplementedError

    @abstractmethod
    async def create_scalar_index(
        self,
        column: str,
        *,
        replace: bool = True,
    ):
        """Create a scalar index on a column.

        Scalar indices, like vector indices, can be used to speed up scans. A scalar
        index can speed up scans that contain filter expressions on the indexed column.
        For example, the following scan will be faster if the column ``my_col`` has
        a scalar index:

        .. code-block:: python

            import lancedb

            db = lancedb.connect("/data/lance")
            img_table = db.open_table("images")
            my_df = img_table.search().where("my_col = 7", prefilter=True).to_pandas()

        Scalar indices can also speed up scans containing a vector search and a
        prefilter:

        .. code-block:: python

            import lancedb

            db = lancedb.connect("/data/lance")
            img_table = db.open_table("images")
            (img_table.search([1, 2, 3, 4], vector_column_name="vector")
                .where("my_col != 7", prefilter=True)
                .to_pandas())

        Scalar indices can only speed up scans for basic filters using
        equality, comparison, range (e.g. ``my_col BETWEEN 0 AND 100``), and set
        membership (e.g. ``my_col IN (0, 1, 2)``).

        Scalar indices can be used if the filter contains multiple indexed columns and
        the filter criteria are AND'd or OR'd together
        (e.g. ``my_col < 0 AND other_col > 100``).

        Scalar indices may be used if the filter contains non-indexed columns but,
        depending on the structure of the filter, they may not be usable. For example,
        if the column ``not_indexed`` does not have a scalar index then the filter
        ``my_col = 0 OR not_indexed = 1`` will not be able to use any scalar index on
        ``my_col``.

        **Experimental API**

        Parameters
        ----------
        column : str
            The column to be indexed. Must be a boolean, integer, float,
            or string column.
        replace : bool, default True
            Replace the existing index if it exists.

        Examples
        --------

        .. code-block:: python

            import lance

            dataset = lance.dataset("./images.lance")
            dataset.create_scalar_index("category")
        """
        raise NotImplementedError

    @abstractmethod
    async def add(
        self,
        data: DATA,
        mode: str = "append",
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
    ):
        """Add more data to the [Table](Table).

        Parameters
        ----------
        data: DATA
            The data to insert into the table. Acceptable types are:

            - dict or list-of-dict

            - pandas.DataFrame

            - pyarrow.Table or pyarrow.RecordBatch
        mode: str
            The mode to use when writing the data. Valid values are
            "append" and "overwrite".
        on_bad_vectors: str, default "error"
            What to do if any of the vectors are not the same size or contain NaNs.
            One of "error", "drop", "fill".
        fill_value: float, default 0.
            The value to use when filling vectors. Only used if on_bad_vectors="fill".
        """
        raise NotImplementedError

    def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
        """
        Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder]
        that can be used to create a "merge insert" operation.

        This operation can add rows, update rows, and remove rows all in a single
        transaction. It is a very generic tool that can be used to create
        behaviors like "insert if not exists", "update or insert (i.e. upsert)",
        or even replace a portion of existing data with new data (e.g. replace
        all data where month="january").

        The merge insert operation works by combining new data from a
        **source table** with existing data in a **target table** by using a
        join. There are three categories of records.

        "Matched" records are records that exist in both the source table and
        the target table. "Not matched" records exist only in the source table
        (e.g. these are new data). "Not matched by source" records exist only
        in the target table (this is old data).

        The builder returned by this method can be used to customize what
        should happen for each category of data.

        Please note that the data may appear to be reordered as part of this
        operation. This is because updated rows will be deleted from the
        dataset and then reinserted at the end with the new values.

        Parameters
        ----------
        on: Union[str, Iterable[str]]
            A column (or columns) to join on. This is how records from the
            source table and target table are matched. Typically this is some
            kind of key or id column.

        Examples
        --------
        >>> import lancedb
        >>> data = pa.table({"a": [2, 1, 3], "b": ["a", "b", "c"]})
        >>> db = lancedb.connect("./.lancedb")
        >>> table = db.create_table("my_table", data)
        >>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
        >>> # Perform an "upsert" operation
        >>> table.merge_insert("a")             \\
        ...      .when_matched_update_all()     \\
        ...      .when_not_matched_insert_all() \\
        ...      .execute(new_data)
        >>> # The order of new rows is non-deterministic since we use
        >>> # a hash-join as part of this operation and so we sort here
        >>> table.to_arrow().sort_by("a").to_pandas()
           a  b
        0  1  b
        1  2  x
        2  3  y
        3  4  z
        """
        # list(on) also accepts any iterable of column names
        on = [on] if isinstance(on, str) else list(on)

        return LanceMergeInsertBuilder(self, on)

    @abstractmethod
    async def search(
        self,
        query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
        vector_column_name: Optional[str] = None,
        query_type: str = "auto",
    ) -> LanceQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector. We currently support [vector search][search]
        and [full-text search][experimental-full-text-search].

        All query options are defined in [Query][lancedb.query.Query].

        Examples
        --------
        >>> import lancedb
        >>> db = lancedb.connect("./.lancedb")
        >>> data = [
        ...     {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]},
        ...     {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]},
        ...     {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]}
        ... ]
        >>> table = db.create_table("my_table", data)
        >>> query = [0.4, 1.4, 2.4]
        >>> (table.search(query)
        ...     .where("original_width > 1000", prefilter=True)
        ...     .select(["caption", "original_width", "vector"])
        ...     .limit(2)
        ...     .to_pandas())
          caption  original_width           vector  _distance
        0     foo            2000  [0.5, 3.4, 1.3]   5.220000
        1    test            3000  [0.3, 6.2, 2.6]  23.089996

        Parameters
        ----------
        query: list/np.ndarray/str/PIL.Image.Image, default None
            The targeted vector to search for.

            - *default None*.
              Acceptable types are: list, np.ndarray, PIL.Image.Image

            - If None then the select/where/limit clauses are applied to filter
              the table
        vector_column_name: str, optional
            The name of the vector column to search.

            The vector column needs to be a pyarrow fixed size list type.

            - If not specified then the vector column is inferred from
              the table schema

            - If the table has multiple vector columns then the *vector_column_name*
              needs to be specified. Otherwise, an error is raised.
        query_type: str
            *default "auto"*.
            Acceptable types are: "vector", "fts", "hybrid", or "auto"

            - If "auto" then the query type is inferred from the query;

            - If `query` is a list/np.ndarray then the query type is
              "vector";

            - If `query` is a PIL.Image.Image then either do vector search,
              or raise an error if no corresponding embedding function is found.

            - If `query` is a string, then the query type is "vector" if the
              table has embedding functions, else the query type is "fts"

        Returns
        -------
        LanceQueryBuilder
            A query builder object representing the query.
            Once executed, the query returns

            - selected columns

            - the vector

            - and also the "_distance" column which is the distance between the query
              vector and the returned vector.
        """
        raise NotImplementedError

    @abstractmethod
    async def _execute_query(self, query: Query) -> pa.Table:
        pass

    @abstractmethod
    async def _do_merge(
        self,
        merge: LanceMergeInsertBuilder,
        new_data: DATA,
        on_bad_vectors: str,
        fill_value: float,
    ):
        pass

    @abstractmethod
    async def delete(self, where: str):
        """Delete rows from the table.

        This can be used to delete a single row, many rows, all rows, or
        sometimes no rows (if your predicate matches nothing).

        Parameters
        ----------
        where: str
            The SQL where clause to use when deleting rows.

            - For example, 'x = 2' or 'x IN (1, 2, 3)'.

            The filter must not be empty, or it will error.

        Examples
        --------
        >>> import lancedb
        >>> data = [
        ...     {"x": 1, "vector": [1, 2]},
        ...     {"x": 2, "vector": [3, 4]},
        ...     {"x": 3, "vector": [5, 6]}
        ... ]
        >>> db = lancedb.connect("./.lancedb")
        >>> table = db.create_table("my_table", data)
        >>> table.to_pandas()
           x      vector
        0  1  [1.0, 2.0]
        1  2  [3.0, 4.0]
        2  3  [5.0, 6.0]
        >>> table.delete("x = 2")
        >>> table.to_pandas()
           x      vector
        0  1  [1.0, 2.0]
        1  3  [5.0, 6.0]

        If you have a list of values to delete, you can combine them into a
        stringified list and use the `IN` operator:

        >>> to_remove = [1, 5]
        >>> to_remove = ", ".join([str(v) for v in to_remove])
        >>> to_remove
        '1, 5'
        >>> table.delete(f"x IN ({to_remove})")
        >>> table.to_pandas()
           x      vector
        0  3  [5.0, 6.0]
        """
        raise NotImplementedError

    @abstractmethod
    async def update(
        self,
        where: Optional[str] = None,
        values: Optional[dict] = None,
        *,
        values_sql: Optional[Dict[str, str]] = None,
    ):
        """
        This can be used to update anywhere from zero rows to all rows,
        depending on how many rows match the where clause. If no where clause
        is provided, then all rows will be updated.

        Either `values` or `values_sql` must be provided. You cannot provide
        both.

        Parameters
        ----------
        where: str, optional
            The SQL where clause to use when updating rows. For example, 'x = 2'
            or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error.
        values: dict, optional
            The values to update. The keys are the column names and the values
            are the values to set.
        values_sql: dict, optional
            The values to update, expressed as SQL expression strings. These can
            reference existing columns. For example, {"x": "x + 1"} will increment
            the x column by 1.

        Examples
        --------
        >>> import lancedb
        >>> import pandas as pd
        >>> data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})
        >>> db = lancedb.connect("./.lancedb")
        >>> table = db.create_table("my_table", data)
        >>> table.to_pandas()
           x      vector
        0  1  [1.0, 2.0]
        1  2  [3.0, 4.0]
        2  3  [5.0, 6.0]
        >>> table.update(where="x = 2", values={"vector": [10, 10]})
        >>> table.to_pandas()
           x        vector
        0  1    [1.0, 2.0]
        1  3    [5.0, 6.0]
        2  2  [10.0, 10.0]
        >>> table.update(values_sql={"x": "x + 1"})
        >>> table.to_pandas()
           x        vector
        0  2    [1.0, 2.0]
        1  4    [5.0, 6.0]
        2  3  [10.0, 10.0]
        """
        raise NotImplementedError

    @abstractmethod
    async def cleanup_old_versions(
        self,
        older_than: Optional[timedelta] = None,
        *,
        delete_unverified: bool = False,
    ) -> CleanupStats:
        """
        Clean up old versions of the table, freeing disk space.

        Note: This function is not available in LanceDB Cloud (since LanceDB
        Cloud manages cleanup for you automatically)

        Parameters
        ----------
        older_than: timedelta, default None
            The minimum age of the version to delete. If None, then this defaults
            to two weeks.
        delete_unverified: bool, default False
            Because they may be part of an in-progress transaction, files newer
            than 7 days old are not deleted by default. If you are sure that
            there are no in-progress transactions, then you can set this to True
            to delete all files older than `older_than`.

        Returns
        -------
        CleanupStats
            The stats of the cleanup operation, including how many bytes were
            freed.
        """
        raise NotImplementedError

    @abstractmethod
    async def compact_files(self, *args, **kwargs):
        """
        Run the compaction process on the table.

        Note: This function is not available in LanceDB Cloud (since LanceDB
        Cloud manages compaction for you automatically)

        This can be run after making several small appends to optimize the table
        for faster reads.

        Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`.
        For most cases, the default should be fine.
        """
        raise NotImplementedError

    @abstractmethod
    async def add_columns(self, transforms: Dict[str, str]):
        """
        Add new columns with defined values.

        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        transforms: Dict[str, str]
            A map of column name to a SQL expression to use to calculate the
            value of the new column. These expressions will be evaluated for
            each row in the table, and can reference existing columns.
        """
        raise NotImplementedError

    @abstractmethod
    async def alter_columns(self, alterations: Iterable[Dict[str, str]]):
        """
        Alter column names and nullability.

        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        alterations : Iterable[Dict[str, Any]]
            A sequence of dictionaries, each with the following keys:

            - "path": str
              The column path to alter. For a top-level column, this is the name.
              For a nested column, this is the dot-separated path, e.g. "a.b.c".
            - "name": str, optional
              The new name of the column. If not specified, the column name is
              not changed.
            - "nullable": bool, optional
              Whether the column should be nullable. If not specified, the column
              nullability is not changed. Only non-nullable columns can be changed
              to nullable. Currently, you cannot change a nullable column to
              non-nullable.
        """
        raise NotImplementedError

    @abstractmethod
    async def drop_columns(self, columns: Iterable[str]):
        """
        Drop columns from the table.

        This is not yet available in LanceDB Cloud.

        Parameters
        ----------
        columns : Iterable[str]
            The names of the columns to drop.
        """
        raise NotImplementedError


class AsyncLanceTable(AsyncTable):
    def __init__(self, table: LanceDBTable):
        self._inner = table

    @property
    @override
    def name(self) -> str:
        return self._inner.name()

    @override
    async def schema(self) -> pa.Schema:
        return await self._inner.schema()

    @override
    async def count_rows(self, filter: Optional[str] = None) -> int:
        raise NotImplementedError

    async def to_pandas(self) -> "pd.DataFrame":
        # to_arrow is async, so it must be awaited before converting
        return (await self.to_arrow()).to_pandas()

    @override
    async def to_arrow(self) -> pa.Table:
        raise NotImplementedError

    async def create_index(
        self,
        metric="L2",
        num_partitions=256,
        num_sub_vectors=96,
        vector_column_name: str = VECTOR_COLUMN_NAME,
        replace: bool = True,
        accelerator: Optional[str] = None,
        index_cache_size: Optional[int] = None,
    ):
        raise NotImplementedError

    @override
    async def create_scalar_index(
        self,
        column: str,
        *,
        replace: bool = True,
    ):
        raise NotImplementedError

    @override
    async def add(
        self,
        data: DATA,
        mode: str = "append",
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
    ):
        raise NotImplementedError

    def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder:
        on = [on] if isinstance(on, str) else list(on)

        return LanceMergeInsertBuilder(self, on)

    @override
    async def search(
        self,
        query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None,
        vector_column_name: Optional[str] = None,
        query_type: str = "auto",
    ) -> LanceQueryBuilder:
        raise NotImplementedError

    @override
    async def _execute_query(self, query: Query) -> pa.Table:
        pass

    @override
    async def _do_merge(
        self,
        merge: LanceMergeInsertBuilder,
        new_data: DATA,
        on_bad_vectors: str,
        fill_value: float,
    ):
        pass

    @override
    async def delete(self, where: str):
        raise NotImplementedError

    @override
    async def update(
        self,
        where: Optional[str] = None,
        values: Optional[dict] = None,
        *,
        values_sql: Optional[Dict[str, str]] = None,
    ):
        raise NotImplementedError

    @override
    async def cleanup_old_versions(
        self,
        older_than: Optional[timedelta] = None,
        *,
        delete_unverified: bool = False,
    ) -> CleanupStats:
        raise NotImplementedError

    @override
    async def compact_files(self, *args, **kwargs):
        raise NotImplementedError

    @override
    async def add_columns(self, transforms: Dict[str, str]):
        raise NotImplementedError

    @override
    async def alter_columns(self, alterations: Iterable[Dict[str, str]]):
        raise NotImplementedError

    @override
    async def drop_columns(self, columns: Iterable[str]):
        raise NotImplementedError

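In terms of the migration notes above, the user-visible change here is that `schema` moves from a property to an awaitable method, and `len(table)` is replaced by `count_rows` (a sketch; `count_rows` and most other methods still raise NotImplementedError in this commit):

    # sync API
    # schema = table.schema
    # n = len(table)

    # async API
    schema = await table.schema()
    n = await table.count_rows()  # replaces len(table), once implemented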
@@ -250,6 +250,78 @@ def test_create_exist_ok(tmp_path):
        db.create_table("test", schema=bad_schema, exist_ok=True)


@pytest.mark.asyncio
async def test_create_mode_async(tmp_path):
    db = await lancedb.connect_async(tmp_path)
    data = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    await db.create_table("test", data=data)

    with pytest.raises(RuntimeError):
        await db.create_table("test", data=data)

    new_data = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["fizz", "buzz"],
            "price": [10.0, 20.0],
        }
    )
    _tbl = await db.create_table("test", data=new_data, mode="overwrite")

    # MIGRATION: to_pandas() is not available in async
    # assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]


@pytest.mark.asyncio
async def test_create_exist_ok_async(tmp_path):
    db = await lancedb.connect_async(tmp_path)
    data = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    tbl = await db.create_table("test", data=data)

    with pytest.raises(RuntimeError):
        await db.create_table("test", data=data)

    # open the table but don't add more rows
    tbl2 = await db.create_table("test", data=data, exist_ok=True)
    assert tbl.name == tbl2.name
    assert await tbl.schema() == await tbl2.schema()

    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), list_size=2)),
            pa.field("item", pa.utf8()),
            pa.field("price", pa.float64()),
        ]
    )
    tbl3 = await db.create_table("test", schema=schema, exist_ok=True)
    assert await tbl3.schema() == schema

    # MIGRATION: when creating a table that already exists with a different
    # schema, it should raise an error.
    # bad_schema = pa.schema(
    #     [
    #         pa.field("vector", pa.list_(pa.float32(), list_size=2)),
    #         pa.field("item", pa.utf8()),
    #         pa.field("price", pa.float64()),
    #         pa.field("extra", pa.float32()),
    #     ]
    # )
    # with pytest.raises(ValueError):
    #     await db.create_table("test", schema=bad_schema, exist_ok=True)


def test_delete_table(tmp_path):
    db = lancedb.connect(tmp_path)
    data = pd.DataFrame(
@@ -12,19 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::time::Duration;
use std::{sync::Arc, time::Duration};

use lancedb::connection::Connection as LanceConnection;
use pyo3::{pyclass, pyfunction, pymethods, PyAny, PyRef, PyResult, Python};
use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow};
use lancedb::connection::{Connection as LanceConnection, CreateTableMode};
use pyo3::{
    exceptions::PyValueError, pyclass, pyfunction, pymethods, PyAny, PyRef, PyResult, Python,
};
use pyo3_asyncio::tokio::future_into_py;

use crate::error::PythonErrorExt;
use crate::{error::PythonErrorExt, table::Table};

#[pyclass]
pub struct Connection {
    inner: LanceConnection,
}

impl Connection {
    fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> {
        match mode {
            "create" => Ok(CreateTableMode::Create),
            "overwrite" => Ok(CreateTableMode::Overwrite),
            "exist_ok" => Ok(CreateTableMode::exist_ok(|builder| builder)),
            _ => Err(PyValueError::new_err(format!("Invalid mode {}", mode))),
        }
    }
}

#[pymethods]
impl Connection {
    pub fn table_names(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
@@ -33,6 +47,51 @@ impl Connection {
            inner.table_names().await.infer_error()
        })
    }

    pub fn create_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
        mode: &str,
        data: &PyAny,
    ) -> PyResult<&'a PyAny> {
        let inner = self_.inner.clone();

        let mode = Self::parse_create_mode_str(mode)?;

        let batches = Box::new(ArrowArrayStreamReader::from_pyarrow(data)?);
        future_into_py(self_.py(), async move {
            let table = inner
                .create_table(name, batches)
                .mode(mode)
                .execute()
                .await
                .infer_error()?;
            Ok(Table::new(table))
        })
    }

    pub fn create_empty_table<'a>(
        self_: PyRef<'a, Self>,
        name: String,
        mode: &str,
        schema: &PyAny,
    ) -> PyResult<&'a PyAny> {
        let inner = self_.inner.clone();

        let mode = Self::parse_create_mode_str(mode)?;

        let schema = Schema::from_pyarrow(schema)?;

        future_into_py(self_.py(), async move {
            let table = inner
                .create_empty_table(name, Arc::new(schema))
                .mode(mode)
                .execute()
                .await
                .infer_error()?;
            Ok(Table::new(table))
        })
    }
}

#[pyfunction]
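`parse_create_mode_str` is what makes the string modes coming from Python line up with `CreateTableMode`; a hedged sketch of how the three modes are reached from the async Python API (the table name is illustrative, the calls mirror the tests above):

    # mode="create" (default): errors if the table already exists
    await db.create_table("t", data=data)

    # mode="create" plus exist_ok=True: rewritten to "exist_ok" on the
    # Python side before crossing into Rust, mapping to CreateTableMode::exist_ok
    await db.create_table("t", data=data, exist_ok=True)

    # mode="overwrite": maps to CreateTableMode::Overwrite
    await db.create_table("t", data=data, mode="overwrite")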
@@ -45,6 +45,7 @@ impl<T> PythonErrorExt<T> for std::result::Result<T, LanceError> {
            LanceError::Lance { .. } => self.runtime_error(),
            LanceError::Runtime { .. } => self.runtime_error(),
            LanceError::Http { .. } => self.runtime_error(),
            LanceError::Arrow { .. } => self.runtime_error(),
        },
    }
}
@@ -17,7 +17,8 @@ use env_logger::Env;
use pyo3::{pymodule, types::PyModule, wrap_pyfunction, PyResult, Python};

pub mod connection;
pub(crate) mod error;
pub mod error;
pub mod table;

#[pymodule]
pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> {
python/src/table.rs (new file, 34 lines)
@@ -0,0 +1,34 @@
use std::sync::Arc;

use arrow::pyarrow::ToPyArrow;
use lancedb::table::Table as LanceTable;
use pyo3::{pyclass, pymethods, PyAny, PyRef, PyResult, Python};
use pyo3_asyncio::tokio::future_into_py;

use crate::error::PythonErrorExt;

#[pyclass]
pub struct Table {
    inner: Arc<dyn LanceTable>,
}

impl Table {
    pub(crate) fn new(inner: Arc<dyn LanceTable>) -> Self {
        Self { inner }
    }
}

#[pymethods]
impl Table {
    pub fn name(&self) -> String {
        self.inner.name().to_string()
    }

    pub fn schema(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
        let inner = self_.inner.clone();
        future_into_py(self_.py(), async move {
            let schema = inner.schema().await.infer_error()?;
            Python::with_gil(|py| schema.to_pyarrow(py))
        })
    }
}
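The `schema` binding returns a future that resolves to a `pyarrow.Schema` via `to_pyarrow`; from Python that surfaces as (a sketch, reusing `pa` and the `table` handle from the earlier examples):

    schema = await table.schema()
    assert isinstance(schema, pa.Schema)  # to_pyarrow round-trips the Arrow schema
    print(table.name())  # name() is synchronous, matching the .pyi stub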