mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-14 10:30:40 +00:00
docs: add async examples to doc (#1941)
- added sync and async tabs for python examples - moved python code to tests/docs --------- Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
@@ -8,54 +8,55 @@ and PyArrow. The sequence of steps in a typical workflow is shown below.
|
||||
|
||||
First, we need to connect to a LanceDB database.
|
||||
|
||||
```py
|
||||
=== "Sync API"
|
||||
|
||||
import lancedb
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-lancedb"
|
||||
--8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb"
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
db = lancedb.connect("data/sample-lancedb")
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-lancedb"
|
||||
--8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb_async"
|
||||
```
|
||||
|
||||
We can load a Pandas `DataFrame` to LanceDB directly.
|
||||
|
||||
```py
|
||||
import pandas as pd
|
||||
=== "Sync API"
|
||||
|
||||
data = pd.DataFrame({
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0]
|
||||
})
|
||||
table = db.create_table("pd_table", data=data)
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-pandas"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_pandas"
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-pandas"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_pandas_async"
|
||||
```
|
||||
|
||||
Similar to the [`pyarrow.dataset.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html) function, LanceDB's
|
||||
[`db.create_table()`](python.md/#lancedb.db.DBConnection.create_table) accepts data in a variety of forms.
|
||||
|
||||
If you have a dataset that is larger than memory, you can create a table with `Iterator[pyarrow.RecordBatch]` to lazily load the data:
|
||||
|
||||
```py
|
||||
=== "Sync API"
|
||||
|
||||
from typing import Iterable
|
||||
import pyarrow as pa
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-iterable"
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-pyarrow"
|
||||
--8<-- "python/python/tests/docs/test_python.py:make_batches"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_iterable"
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
def make_batches() -> Iterable[pa.RecordBatch]:
|
||||
for i in range(5):
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]]),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
["vector", "item", "price"])
|
||||
|
||||
schema=pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32())),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float32()),
|
||||
])
|
||||
|
||||
table = db.create_table("iterable_table", data=make_batches(), schema=schema)
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-iterable"
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-pyarrow"
|
||||
--8<-- "python/python/tests/docs/test_python.py:make_batches"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_iterable_async"
|
||||
```
|
||||
|
||||
You will find detailed instructions for creating a LanceDB dataset in
|
||||
[Getting Started](../basic.md#quick-start) and [API](python.md/#lancedb.db.DBConnection.create_table)
|
||||
@@ -65,15 +66,16 @@ sections.
|
||||
|
||||
We can now perform similarity search via the LanceDB Python API.
|
||||
|
||||
```py
|
||||
# Open the table previously created.
|
||||
table = db.open_table("pd_table")
|
||||
=== "Sync API"
|
||||
|
||||
query_vector = [100, 100]
|
||||
# Pandas DataFrame
|
||||
df = table.search(query_vector).limit(1).to_pandas()
|
||||
print(df)
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search"
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search_async"
|
||||
```
|
||||
|
||||
```
|
||||
vector item price _distance
|
||||
@@ -83,16 +85,13 @@ print(df)
|
||||
If you have a simple filter, it's faster to provide a `where` clause to LanceDB's `search` method.
|
||||
For more complex filters or aggregations, you can always resort to using the underlying `DataFrame` methods after performing a search.
|
||||
|
||||
```python
|
||||
=== "Sync API"
|
||||
|
||||
# Apply the filter via LanceDB
|
||||
results = table.search([100, 100]).where("price < 15").to_pandas()
|
||||
assert len(results) == 1
|
||||
assert results["item"].iloc[0] == "foo"
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search_with_filter"
|
||||
```
|
||||
=== "Async API"
|
||||
|
||||
# Apply the filter via Pandas
|
||||
df = results = table.search([100, 100]).to_pandas()
|
||||
results = df[df.price < 15]
|
||||
assert len(results) == 1
|
||||
assert results["item"].iloc[0] == "foo"
|
||||
```
|
||||
```python
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search_with_filter_async"
|
||||
```
|
||||
|
||||
@@ -2,38 +2,29 @@
|
||||
|
||||
LanceDB supports [Polars](https://github.com/pola-rs/polars), a blazingly fast DataFrame library for Python written in Rust. Just like in Pandas, the Polars integration is enabled by PyArrow under the hood. A deeper integration between Lance Tables and Polars DataFrames is in progress, but at the moment, you can read a Polars DataFrame into LanceDB and output the search results from a query to a Polars DataFrame.
|
||||
|
||||
|
||||
## Create & Query LanceDB Table
|
||||
|
||||
### From Polars DataFrame
|
||||
|
||||
First, we connect to a LanceDB database.
|
||||
|
||||
```py
|
||||
import lancedb
|
||||
|
||||
db = lancedb.connect("data/polars-lancedb")
|
||||
```py
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-lancedb"
|
||||
--8<-- "python/python/tests/docs/test_python.py:connect_to_lancedb"
|
||||
```
|
||||
|
||||
We can load a Polars `DataFrame` to LanceDB directly.
|
||||
|
||||
```py
|
||||
import polars as pl
|
||||
|
||||
data = pl.DataFrame({
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0]
|
||||
})
|
||||
table = db.create_table("pl_table", data=data)
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-polars"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_polars"
|
||||
```
|
||||
|
||||
We can now perform similarity search via the LanceDB Python API.
|
||||
|
||||
```py
|
||||
query = [3.0, 4.0]
|
||||
result = table.search(query).limit(1).to_polars()
|
||||
print(result)
|
||||
print(type(result))
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search_polars"
|
||||
```
|
||||
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
@@ -59,33 +50,16 @@ Note that the type of the result from a table search is a Polars DataFrame.
|
||||
Alternatively, we can create an empty LanceDB Table using a Pydantic schema and populate it with a Polars DataFrame.
|
||||
|
||||
```py
|
||||
import polars as pl
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
|
||||
class Item(LanceModel):
|
||||
vector: Vector(2)
|
||||
item: str
|
||||
price: float
|
||||
|
||||
data = {
|
||||
"vector": [[3.1, 4.1]],
|
||||
"item": "foo",
|
||||
"price": 10.0,
|
||||
}
|
||||
|
||||
table = db.create_table("test_table", schema=Item)
|
||||
df = pl.DataFrame(data)
|
||||
# Add Polars DataFrame to table
|
||||
table.add(df)
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-polars"
|
||||
--8<-- "python/python/tests/docs/test_python.py:import-lancedb-pydantic"
|
||||
--8<-- "python/python/tests/docs/test_python.py:class_Item"
|
||||
--8<-- "python/python/tests/docs/test_python.py:create_table_pydantic"
|
||||
```
|
||||
|
||||
The table can now be queried as usual.
|
||||
|
||||
```py
|
||||
result = table.search([3.0, 4.0]).limit(1).to_polars()
|
||||
print(result)
|
||||
print(type(result))
|
||||
--8<-- "python/python/tests/docs/test_python.py:vector_search_polars"
|
||||
```
|
||||
|
||||
```
|
||||
@@ -108,8 +82,7 @@ As you iterate on your application, you'll likely need to work with the whole ta
|
||||
LanceDB tables can also be converted directly into a Polars LazyFrame for further processing.
|
||||
|
||||
```python
|
||||
ldf = table.to_polars()
|
||||
print(type(ldf))
|
||||
--8<-- "python/python/tests/docs/test_python.py:dump_table_lazyform"
|
||||
```
|
||||
|
||||
Unlike the search result from a query, we can see that the type of the result is a LazyFrame.
|
||||
@@ -121,7 +94,7 @@ Unlike the search result from a query, we can see that the type of the result is
|
||||
We can now work with the LazyFrame as we would in Polars, and collect the first result.
|
||||
|
||||
```python
|
||||
print(ldf.first().collect())
|
||||
--8<-- "python/python/tests/docs/test_python.py:print_table_lazyform"
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user