docs: add async examples to doc (#1941)

- added sync and async tabs for Python examples
- moved Python code to tests/docs

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Author: QianZhu
Date: 2025-01-07 15:10:25 -08:00
Committed by: GitHub
Parent: 0b45ef93c0
Commit: 17c9e9afea
21 changed files with 3639 additions and 987 deletions
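Context for the markers below: the `# --8<-- [start:NAME]` / `# --8<-- [end:NAME]` comments are section markers for the pymdownx `snippets` extension; the docs pages pull the marked regions in and render them as the sync/async tabs this commit mentions. A minimal sketch of the markdown side, assuming mkdocs-material content tabs and a hypothetical snippet path of `python/python/tests/docs/test_basic.py` (the exact file paths are not shown in this diff view):

=== "Sync API"

    ```python
    --8<-- "python/python/tests/docs/test_basic.py:create_table"
    ```

=== "Async API"

    ```python
    --8<-- "python/python/tests/docs/test_basic.py:create_table_async"
    ```

Keeping the examples in real test files means the snippets run under pytest, so the documented code can't silently drift out of date.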


@@ -125,7 +125,7 @@ async def test_quickstart_async():
# --8<-- [start:create_table_async]
# Asynchronous client
async_tbl = await async_db.create_table("my_table2", data=data)
async_tbl = await async_db.create_table("my_table_async", data=data)
# --8<-- [end:create_table_async]
df = pd.DataFrame(
@@ -137,17 +137,17 @@ async def test_quickstart_async():
# --8<-- [start:create_table_async_pandas]
# Asynchronous client
async_tbl = await async_db.create_table("table_from_df2", df)
async_tbl = await async_db.create_table("table_from_df_async", df)
# --8<-- [end:create_table_async_pandas]
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
# --8<-- [start:create_empty_table_async]
# Asynchronous client
async_tbl = await async_db.create_table("empty_table2", schema=schema)
async_tbl = await async_db.create_table("empty_table_async", schema=schema)
# --8<-- [end:create_empty_table_async]
# --8<-- [start:open_table_async]
# Asynchronous client
async_tbl = await async_db.open_table("my_table2")
async_tbl = await async_db.open_table("my_table_async")
# --8<-- [end:open_table_async]
# --8<-- [start:table_names_async]
# Asynchronous client
@@ -161,6 +161,22 @@ async def test_quickstart_async():
data = [{"vector": [x, x], "item": "filler", "price": x * x} for x in range(1000)]
await async_tbl.add(data)
# --8<-- [start:vector_search_async]
+# --8<-- [start:add_columns_async]
+await async_tbl.add_columns({"double_price": "cast((price * 2) as float)"})
+# --8<-- [end:add_columns_async]
+# --8<-- [start:alter_columns_async]
+await async_tbl.alter_columns(
+{
+"path": "double_price",
+"rename": "dbl_price",
+"data_type": pa.float64(),
+"nullable": True,
+}
+)
+# --8<-- [end:alter_columns_async]
+# --8<-- [start:drop_columns_async]
+await async_tbl.drop_columns(["dbl_price"])
+# --8<-- [end:drop_columns_async]
# Asynchronous client
await async_tbl.vector_search([100, 100]).limit(2).to_pandas()
# --8<-- [end:vector_search_async]
@@ -174,5 +190,5 @@ async def test_quickstart_async():
# --8<-- [end:delete_rows_async]
# --8<-- [start:drop_table_async]
# Asynchronous client
await async_db.drop_table("my_table2")
await async_db.drop_table("my_table_async")
# --8<-- [end:drop_table_async]


@@ -0,0 +1,169 @@
# --8<-- [start:import-lancedb]
import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-lancedb-ivfpq]
from lancedb.index import IvfPq
# --8<-- [end:import-lancedb-ivfpq]
# --8<-- [start:import-lancedb-btree-bitmap]
from lancedb.index import BTree, Bitmap
# --8<-- [end:import-lancedb-btree-bitmap]
# --8<-- [start:import-numpy]
import numpy as np
# --8<-- [end:import-numpy]
import pytest
def test_ann_index():
# --8<-- [start:create_ann_index]
uri = "data/sample-lancedb"
# Create 5,000 sample vectors
data = [
{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
]
db = lancedb.connect(uri)
# Add the vectors to a table
tbl = db.create_table("my_vectors", data=data)
# Create and train the index - you need to have enough data in the table
# for an effective training step
tbl.create_index(num_partitions=2, num_sub_vectors=4)
# --8<-- [end:create_ann_index]
# --8<-- [start:vector_search]
tbl.search(np.random.random((32))).limit(2).nprobes(20).refine_factor(
10
).to_pandas()
# --8<-- [end:vector_search]
# --8<-- [start:vector_search_with_filter]
tbl.search(np.random.random((32))).where("item != 'item 1141'").to_pandas()
# --8<-- [end:vector_search_with_filter]
# --8<-- [start:vector_search_with_select]
tbl.search(np.random.random((32))).select(["vector"]).to_pandas()
# --8<-- [end:vector_search_with_select]
@pytest.mark.asyncio
async def test_ann_index_async():
# --8<-- [start:create_ann_index_async]
uri = "data/sample-lancedb"
# Create 5,000 sample vectors
data = [
{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
]
async_db = await lancedb.connect_async(uri)
# Add the vectors to a table
async_tbl = await async_db.create_table("my_vectors_async", data=data)
# Create and train the index - you need to have enough data in the table
# for an effective training step
await async_tbl.create_index(
"vector", config=IvfPq(num_partitions=2, num_sub_vectors=4)
)
# --8<-- [end:create_ann_index_async]
# --8<-- [start:vector_search_async]
await (
async_tbl.query()
.nearest_to(np.random.random((32)))
.limit(2)
.nprobes(20)
.refine_factor(10)
.to_pandas()
)
# --8<-- [end:vector_search_async]
# --8<-- [start:vector_search_async_with_filter]
await (
async_tbl.query()
.nearest_to(np.random.random((32)))
.where("item != 'item 1141'")
.to_pandas()
)
# --8<-- [end:vector_search_async_with_filter]
# --8<-- [start:vector_search_async_with_select]
await (
async_tbl.query()
.nearest_to(np.random.random((32)))
.select(["vector"])
.to_pandas()
)
# --8<-- [end:vector_search_async_with_select]
def test_scalar_index():
# --8<-- [start:basic_scalar_index]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
books = [
{
"book_id": 1,
"publisher": "plenty of books",
"tags": ["fantasy", "adventure"],
},
{"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
{"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
]
table = db.create_table("books", books)
table.create_scalar_index("book_id") # BTree by default
table.create_scalar_index("publisher", index_type="BITMAP")
# --8<-- [end:basic_scalar_index]
# --8<-- [start:search_with_scalar_index]
table = db.open_table("books")
table.search().where("book_id = 2").to_pandas()
# --8<-- [end:search_with_scalar_index]
# --8<-- [start:vector_search_with_scalar_index]
data = [
{"book_id": 1, "vector": [1, 2]},
{"book_id": 2, "vector": [3, 4]},
{"book_id": 3, "vector": [5, 6]},
]
table = db.create_table("book_with_embeddings", data)
(table.search([1, 2]).where("book_id != 3", prefilter=True).to_pandas())
# --8<-- [end:vector_search_with_scalar_index]
# --8<-- [start:update_scalar_index]
table.add([{"vector": [7, 8], "book_id": 4}])
table.optimize()
# --8<-- [end:update_scalar_index]
@pytest.mark.asyncio
async def test_scalar_index_async():
# --8<-- [start:basic_scalar_index_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
books = [
{
"book_id": 1,
"publisher": "plenty of books",
"tags": ["fantasy", "adventure"],
},
{"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
{"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
]
async_tbl = await async_db.create_table("books_async", books)
await async_tbl.create_index("book_id", config=BTree()) # BTree by default
await async_tbl.create_index("publisher", config=Bitmap())
# --8<-- [end:basic_scalar_index_async]
# --8<-- [start:search_with_scalar_index_async]
async_tbl = await async_db.open_table("books_async")
await async_tbl.query().where("book_id = 2").to_pandas()
# --8<-- [end:search_with_scalar_index_async]
# --8<-- [start:vector_search_with_scalar_index_async]
data = [
{"book_id": 1, "vector": [1, 2]},
{"book_id": 2, "vector": [3, 4]},
{"book_id": 3, "vector": [5, 6]},
]
async_tbl = await async_db.create_table("book_with_embeddings_async", data)
(await async_tbl.query().where("book_id != 3").nearest_to([1, 2]).to_pandas())
# --8<-- [end:vector_search_with_scalar_index_async]
# --8<-- [start:update_scalar_index_async]
await async_tbl.add([{"vector": [7, 8], "book_id": 4}])
await async_tbl.optimize()
# --8<-- [end:update_scalar_index_async]


@@ -0,0 +1,576 @@
# --8<-- [start:import-lancedb]
import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-pandas]
import pandas as pd
# --8<-- [end:import-pandas]
# --8<-- [start:import-pyarrow]
import pyarrow as pa
# --8<-- [end:import-pyarrow]
# --8<-- [start:import-polars]
import polars as pl
# --8<-- [end:import-polars]
# --8<-- [start:import-numpy]
import numpy as np
# --8<-- [end:import-numpy]
# --8<-- [start:import-lancedb-pydantic]
from lancedb.pydantic import Vector, LanceModel
# --8<-- [end:import-lancedb-pydantic]
# --8<-- [start:import-datetime]
from datetime import timedelta
# --8<-- [end:import-datetime]
# --8<-- [start:import-embeddings]
from lancedb.embeddings import get_registry
# --8<-- [end:import-embeddings]
# --8<-- [start:import-pydantic-basemodel]
from pydantic import BaseModel
# --8<-- [end:import-pydantic-basemodel]
import pytest
# --8<-- [start:class-Content]
class Content(LanceModel):
movie_id: int
vector: Vector(128)
genres: str
title: str
imdb_id: int
@property
def imdb_url(self) -> str:
return f"https://www.imdb.com/title/tt{self.imdb_id}"
# --8<-- [end:class-Content]
# --8<-- [start:class-Document]
class Document(BaseModel):
content: str
source: str
# --8<-- [end:class-Document]
# --8<-- [start:class-NestedSchema]
class NestedSchema(LanceModel):
id: str
vector: Vector(1536)
document: Document
# --8<-- [end:class-NestedSchema]
# --8<-- [start:class-Item]
class Item(LanceModel):
vector: Vector(2)
item: str
price: float
# --8<-- [end:class-Item]
# --8<-- [start:make_batches]
def make_batches():
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array(
[[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
pa.list_(pa.float32(), 4),
),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
)
# --8<-- [end:make_batches]
# --8<-- [start:make_batches_for_add]
def make_batches_for_add():
for i in range(5):
yield [
{"vector": [3.1, 4.1], "item": "peach", "price": 6.0},
{"vector": [5.9, 26.5], "item": "pear", "price": 5.0},
]
# --8<-- [end:make_batches_for_add]
def test_table():
# --8<-- [start:connect]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# --8<-- [end:connect]
# --8<-- [start:create_table]
data = [
{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
{"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1},
]
db.create_table("test_table", data)
db["test_table"].head()
# --8<-- [end:create_table]
# --8<-- [start:create_table_exist_ok]
db.create_table("test_table", data, exist_ok=True)
# --8<-- [end:create_table_exist_ok]
# --8<-- [start:create_table_overwrite]
db.create_table("test_table", data, mode="overwrite")
# --8<-- [end:create_table_overwrite]
# --8<-- [start:create_table_from_pandas]
data = pd.DataFrame(
{
"vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
"lat": [45.5, 40.1],
"long": [-122.7, -74.1],
}
)
db.create_table("my_table_pandas", data)
db["my_table_pandas"].head()
# --8<-- [end:create_table_from_pandas]
# --8<-- [start:create_table_custom_schema]
custom_schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 4)),
pa.field("lat", pa.float32()),
pa.field("long", pa.float32()),
]
)
tbl = db.create_table("my_table_custom_schema", data, schema=custom_schema)
# --8<-- [end:create_table_custom_schema]
# --8<-- [start:create_table_from_polars]
data = pl.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
tbl = db.create_table("my_table_pl", data)
# --8<-- [end:create_table_from_polars]
# --8<-- [start:create_table_from_arrow_table]
dim = 16
total = 2
schema = pa.schema(
[pa.field("vector", pa.list_(pa.float16(), dim)), pa.field("text", pa.string())]
)
data = pa.Table.from_arrays(
[
pa.array(
[np.random.randn(dim).astype(np.float16) for _ in range(total)],
pa.list_(pa.float16(), dim),
),
pa.array(["foo", "bar"]),
],
["vector", "text"],
)
tbl = db.create_table("f16_tbl", data, schema=schema)
# --8<-- [end:create_table_from_arrow_table]
# --8<-- [start:create_table_from_pydantic]
tbl = db.create_table("movielens_small", schema=Content)
# --8<-- [end:create_table_from_pydantic]
# --8<-- [start:create_table_nested_schema]
tbl = db.create_table("nested_table", schema=NestedSchema)
# --8<-- [end:create_table_nested_schema]
# --8<-- [start:create_table_from_batch]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 4)),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
)
db.create_table("batched_tale", make_batches(), schema=schema)
# --8<-- [end:create_table_from_batch]
# --8<-- [start:list_tables]
print(db.table_names())
# --8<-- [end:list_tables]
# --8<-- [start:open_table]
tbl = db.open_table("test_table")
# --8<-- [end:open_table]
# --8<-- [start:create_empty_table]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("item", pa.string()),
pa.field("price", pa.float32()),
]
)
tbl = db.create_table("test_empty_table", schema=schema)
# --8<-- [end:create_empty_table]
# --8<-- [start:create_empty_table_pydantic]
tbl = db.create_table("test_empty_table_new", schema=Item.to_arrow_schema())
# --8<-- [end:create_empty_table_pydantic]
# --8<-- [start:add_table_from_pandas]
df = pd.DataFrame(
{
"vector": [[1.3, 1.4], [9.5, 56.2]],
"item": ["banana", "apple"],
"price": [5.0, 7.0],
}
)
tbl.add(df)
# --8<-- [end:add_table_from_pandas]
# --8<-- [start:add_table_from_polars]
df = pl.DataFrame(
{
"vector": [[1.3, 1.4], [9.5, 56.2]],
"item": ["banana", "apple"],
"price": [5.0, 7.0],
}
)
tbl.add(df)
# --8<-- [end:add_table_from_polars]
# --8<-- [start:add_table_from_batch]
tbl.add(make_batches_for_add())
# --8<-- [end:add_table_from_batch]
# --8<-- [start:add_table_from_pyarrow]
pa_table = pa.Table.from_arrays(
[
pa.array([[9.1, 6.7], [9.9, 31.2]], pa.list_(pa.float32(), 2)),
pa.array(["mango", "orange"]),
pa.array([7.0, 4.0]),
],
["vector", "item", "price"],
)
tbl.add(pa_table)
# --8<-- [end:add_table_from_pyarrow]
# --8<-- [start:add_table_from_pydantic]
pydantic_model_items = [
Item(vector=[8.1, 4.7], item="pineapple", price=10.0),
Item(vector=[6.9, 9.3], item="avocado", price=9.0),
]
tbl.add(pydantic_model_items)
# --8<-- [end:add_table_from_pydantic]
# --8<-- [start:delete_row]
tbl.delete('item = "fizz"')
# --8<-- [end:delete_row]
# --8<-- [start:delete_specific_row]
data = [
{"x": 1, "vector": [1, 2]},
{"x": 2, "vector": [3, 4]},
{"x": 3, "vector": [5, 6]},
]
# Synchronous client
tbl = db.create_table("delete_row", data)
tbl.to_pandas()
# x vector
# 0 1 [1.0, 2.0]
# 1 2 [3.0, 4.0]
# 2 3 [5.0, 6.0]
tbl.delete("x = 2")
tbl.to_pandas()
# x vector
# 0 1 [1.0, 2.0]
# 1 3 [5.0, 6.0]
# --8<-- [end:delete_specific_row]
# --8<-- [start:delete_list_values]
to_remove = [1, 5]
to_remove = ", ".join(str(v) for v in to_remove)
tbl.delete(f"x IN ({to_remove})")
tbl.to_pandas()
# x vector
# 0 3 [5.0, 6.0]
# --8<-- [end:delete_list_values]
# --8<-- [start:update_table]
# Create a table from a pandas DataFrame
data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})
tbl = db.create_table("test_table", data, mode="overwrite")
# Update the table where x = 2
tbl.update(where="x = 2", values={"vector": [10, 10]})
# Get the updated table as a pandas DataFrame
df = tbl.to_pandas()
print(df)
# --8<-- [end:update_table]
# --8<-- [start:update_table_sql]
# Update all rows, incrementing x with a SQL expression
tbl.update(values_sql={"x": "x + 1"})
print(tbl.to_pandas())
# --8<-- [end:update_table_sql]
# --8<-- [start:table_strong_consistency]
uri = "data/sample-lancedb"
db = lancedb.connect(uri, read_consistency_interval=timedelta(0))
tbl = db.open_table("test_table")
# --8<-- [end:table_strong_consistency]
# --8<-- [start:table_eventual_consistency]
uri = "data/sample-lancedb"
db = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=5))
tbl = db.open_table("test_table")
# --8<-- [end:table_eventual_consistency]
# --8<-- [start:table_checkout_latest]
tbl = db.open_table("test_table")
# (Other writes happen to test_table from another process)
# Check for updates
tbl.checkout_latest()
# --8<-- [end:table_checkout_latest]
@pytest.mark.skip
def test_table_with_embedding():
db = lancedb.connect("data/sample-lancedb")
# --8<-- [start:create_table_with_embedding]
embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-small-en-v1.5")
class Schema(LanceModel):
text: str = embed_fcn.SourceField()
vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField(default=None)
tbl = db.create_table("my_table_with_embedding", schema=Schema, mode="overwrite")
models = [Schema(text="hello"), Schema(text="world")]
tbl.add(models)
# --8<-- [end:create_table_with_embedding]
@pytest.mark.skip
async def test_table_with_embedding_async():
async_db = await lancedb.connect_async("data/sample-lancedb")
# --8<-- [start:create_table_async_with_embedding]
embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-small-en-v1.5")
class Schema(LanceModel):
text: str = embed_fcn.SourceField()
vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField(default=None)
async_tbl = await async_db.create_table(
"my_table_async_with_embedding", schema=Schema, mode="overwrite"
)
models = [Schema(text="hello"), Schema(text="world")]
await async_tbl.add(models)
# --8<-- [end:create_table_async_with_embedding]
@pytest.mark.asyncio
async def test_table_async():
# --8<-- [start:connect_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
# --8<-- [end:connect_async]
# --8<-- [start:create_table_async]
data = [
{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
{"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1},
]
async_tbl = await async_db.create_table("test_table_async", data)
await async_tbl.head()
# --8<-- [end:create_table_async]
# --8<-- [start:create_table_async_exist_ok]
await async_db.create_table("test_table_async", data, exist_ok=True)
# --8<-- [end:create_table_async_exist_ok]
# --8<-- [start:create_table_async_overwrite]
await async_db.create_table("test_table_async", data, mode="overwrite")
# --8<-- [end:create_table_async_overwrite]
# --8<-- [start:create_table_async_from_pandas]
data = pd.DataFrame(
{
"vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
"lat": [45.5, 40.1],
"long": [-122.7, -74.1],
}
)
async_tbl = await async_db.create_table("my_table_async_pd", data)
await async_tbl.head()
# --8<-- [end:create_table_async_from_pandas]
# --8<-- [start:create_table_async_custom_schema]
custom_schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 4)),
pa.field("lat", pa.float32()),
pa.field("long", pa.float32()),
]
)
async_tbl = await async_db.create_table(
"my_table_async_custom_schema", data, schema=custom_schema
)
# --8<-- [end:create_table_async_custom_schema]
# --8<-- [start:create_table_async_from_polars]
data = pl.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
async_tbl = await async_db.create_table("my_table_async_pl", data)
# --8<-- [end:create_table_async_from_polars]
# --8<-- [start:create_table_async_from_arrow_table]
dim = 16
total = 2
schema = pa.schema(
[pa.field("vector", pa.list_(pa.float16(), dim)), pa.field("text", pa.string())]
)
data = pa.Table.from_arrays(
[
pa.array(
[np.random.randn(dim).astype(np.float16) for _ in range(total)],
pa.list_(pa.float16(), dim),
),
pa.array(["foo", "bar"]),
],
["vector", "text"],
)
async_tbl = await async_db.create_table("f16_tbl_async", data, schema=schema)
# --8<-- [end:create_table_async_from_arrow_table]
# --8<-- [start:create_table_async_from_pydantic]
async_tbl = await async_db.create_table("movielens_small_async", schema=Content)
# --8<-- [end:create_table_async_from_pydantic]
# --8<-- [start:create_table_async_nested_schema]
async_tbl = await async_db.create_table("nested_table_async", schema=NestedSchema)
# --8<-- [end:create_table_async_nested_schema]
# --8<-- [start:create_table_async_from_batch]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 4)),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
)
await async_db.create_table("batched_table", make_batches(), schema=schema)
# --8<-- [end:create_table_async_from_batch]
# --8<-- [start:list_tables_async]
print(await async_db.table_names())
# --8<-- [end:list_tables_async]
# --8<-- [start:open_table_async]
async_tbl = await async_db.open_table("test_table_async")
# --8<-- [end:open_table_async]
# --8<-- [start:create_empty_table_async]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("item", pa.string()),
pa.field("price", pa.float32()),
]
)
async_tbl = await async_db.create_table("test_empty_table_async", schema=schema)
# --8<-- [end:create_empty_table_async]
# --8<-- [start:create_empty_table_async_pydantic]
async_tbl = await async_db.create_table(
"test_empty_table_async_new", schema=Item.to_arrow_schema()
)
# --8<-- [end:create_empty_table_async_pydantic]
# --8<-- [start:add_table_async_from_pandas]
df = pd.DataFrame(
{
"vector": [[1.3, 1.4], [9.5, 56.2]],
"item": ["banana", "apple"],
"price": [5.0, 7.0],
}
)
await async_tbl.add(df)
# --8<-- [end:add_table_async_from_pandas]
# --8<-- [start:add_table_async_from_polars]
df = pl.DataFrame(
{
"vector": [[1.3, 1.4], [9.5, 56.2]],
"item": ["banana", "apple"],
"price": [5.0, 7.0],
}
)
await async_tbl.add(df)
# --8<-- [end:add_table_async_from_polars]
# --8<-- [start:add_table_async_from_batch]
await async_tbl.add(make_batches_for_add())
# --8<-- [end:add_table_async_from_batch]
# --8<-- [start:add_table_async_from_pyarrow]
pa_table = pa.Table.from_arrays(
[
pa.array([[9.1, 6.7], [9.9, 31.2]], pa.list_(pa.float32(), 2)),
pa.array(["mango", "orange"]),
pa.array([7.0, 4.0]),
],
["vector", "item", "price"],
)
await async_tbl.add(pa_table)
# --8<-- [end:add_table_async_from_pyarrow]
# --8<-- [start:add_table_async_from_pydantic]
pydantic_model_items = [
Item(vector=[8.1, 4.7], item="pineapple", price=10.0),
Item(vector=[6.9, 9.3], item="avocado", price=9.0),
]
await async_tbl.add(pydantic_model_items)
# --8<-- [end:add_table_async_from_pydantic]
# --8<-- [start:delete_row_async]
await async_tbl.delete('item = "fizz"')
# --8<-- [end:delete_row_async]
# --8<-- [start:delete_specific_row_async]
data = [
{"x": 1, "vector": [1, 2]},
{"x": 2, "vector": [3, 4]},
{"x": 3, "vector": [5, 6]},
]
async_db = await lancedb.connect_async(uri)
async_tbl = await async_db.create_table("delete_row_async", data)
await async_tbl.to_pandas()
# x vector
# 0 1 [1.0, 2.0]
# 1 2 [3.0, 4.0]
# 2 3 [5.0, 6.0]
await async_tbl.delete("x = 2")
await async_tbl.to_pandas()
# x vector
# 0 1 [1.0, 2.0]
# 1 3 [5.0, 6.0]
# --8<-- [end:delete_specific_row_async]
# --8<-- [start:delete_list_values_async]
to_remove = [1, 5]
to_remove = ", ".join(str(v) for v in to_remove)
await async_tbl.delete(f"x IN ({to_remove})")
await async_tbl.to_pandas()
# x vector
# 0 3 [5.0, 6.0]
# --8<-- [end:delete_list_values_async]
# --8<-- [start:update_table_async]
# Create a table from a pandas DataFrame
data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})
async_tbl = await async_db.create_table("update_table_async", data)
# Update the table where x = 2
await async_tbl.update({"vector": [10, 10]}, where="x = 2")
# Get the updated table as a pandas DataFrame
df = await async_tbl.to_pandas()
# Print the DataFrame
print(df)
# --8<-- [end:update_table_async]
# --8<-- [start:update_table_sql_async]
# Update all rows, incrementing x with a SQL expression
await async_tbl.update(updates_sql={"x": "x + 1"})
print(await async_tbl.to_pandas())
# --8<-- [end:update_table_sql_async]
# --8<-- [start:table_async_strong_consistency]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri, read_consistency_interval=timedelta(0))
async_tbl = await async_db.open_table("test_table_async")
# --8<-- [end:table_async_strong_consistency]
# --8<-- [start:table_async_eventual_consistency]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(
uri, read_consistency_interval=timedelta(seconds=5)
)
async_tbl = await async_db.open_table("test_table_async")
# --8<-- [end:table_async_eventual_consistency]
# --8<-- [start:table_async_checkout_latest]
async_tbl = await async_db.open_table("test_table_async")
# (Other writes happen to test_table_async from another process)
# Check for updates
await async_tbl.checkout_latest()
# --8<-- [end:table_async_checkout_latest]


@@ -0,0 +1,187 @@
# --8<-- [start:import-lancedb]
import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-pandas]
import pandas as pd
# --8<-- [end:import-pandas]
# --8<-- [start:import-iterable]
from typing import Iterable
# --8<-- [end:import-iterable]
# --8<-- [start:import-pyarrow]
import pyarrow as pa
# --8<-- [end:import-pyarrow]
# --8<-- [start:import-polars]
import polars as pl
# --8<-- [end:import-polars]
# --8<-- [start:import-lancedb-pydantic]
from lancedb.pydantic import Vector, LanceModel
# --8<-- [end:import-lancedb-pydantic]
import pytest
# --8<-- [start:make_batches]
def make_batches() -> Iterable[pa.RecordBatch]:
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
)
# --8<-- [end:make_batches]
def test_pandas_and_pyarrow():
# --8<-- [start:connect_to_lancedb]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# --8<-- [end:connect_to_lancedb]
# --8<-- [start:create_table_pandas]
data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
table = db.create_table("pd_table", data=data)
# --8<-- [end:create_table_pandas]
# --8<-- [start:create_table_iterable]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
)
table = db.create_table("iterable_table", data=make_batches(), schema=schema)
# --8<-- [end:create_table_iterable]
# --8<-- [start:vector_search]
# Open the table previously created.
table = db.open_table("pd_table")
query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_pandas()
print(df)
# --8<-- [end:vector_search]
# --8<-- [start:vector_search_with_filter]
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_pandas()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# Apply the filter via Pandas
df = table.search([100, 100]).to_pandas()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# --8<-- [end:vector_search_with_filter]
@pytest.mark.asyncio
async def test_pandas_and_pyarrow_async():
# --8<-- [start:connect_to_lancedb_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
# --8<-- [end:connect_to_lancedb_async]
# --8<-- [start:create_table_pandas_async]
data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
await async_db.create_table("pd_table_async", data=data)
# --8<-- [end:create_table_pandas_async]
# --8<-- [start:create_table_iterable_async]
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
)
await async_db.create_table(
"iterable_table_async", data=make_batches(), schema=schema
)
# --8<-- [end:create_table_iterable_async]
# --8<-- [start:vector_search_async]
# Open the table previously created.
async_tbl = await async_db.open_table("pd_table_async")
query_vector = [100, 100]
# Pandas DataFrame
df = await async_tbl.query().nearest_to(query_vector).limit(1).to_pandas()
print(df)
# --8<-- [end:vector_search_async]
# --8<-- [start:vector_search_with_filter_async]
# Apply the filter via LanceDB
results = (
await async_tbl.query().nearest_to([100, 100]).where("price < 15").to_pandas()
)
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# Apply the filter via Pandas
df = await async_tbl.query().nearest_to([100, 100]).to_pandas()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# --8<-- [end:vector_search_with_filter_async]
# --8<-- [start:class_Item]
class Item(LanceModel):
vector: Vector(2)
item: str
price: float
# --8<-- [end:class_Item]
def test_polars():
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# --8<-- [start:create_table_polars]
data = pl.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
table = db.create_table("pl_table", data=data)
# --8<-- [end:create_table_polars]
# --8<-- [start:vector_search_polars]
query = [3.0, 4.0]
result = table.search(query).limit(1).to_polars()
print(result)
print(type(result))
# --8<-- [end:vector_search_polars]
# --8<-- [start:create_table_pydantic]
table = db.create_table("pydantic_table", schema=Item)
df = pl.DataFrame(data)
# Add Polars DataFrame to table
table.add(df)
# --8<-- [end:create_table_pydantic]
# --8<-- [start:dump_table_lazyform]
ldf = table.to_polars()
print(type(ldf))
# --8<-- [end:dump_table_lazyform]
# --8<-- [start:print_table_lazyform]
print(ldf.first().collect())
# --8<-- [end:print_table_lazyform]


@@ -0,0 +1,366 @@
# --8<-- [start:import-lancedb]
import lancedb
# --8<-- [end:import-lancedb]
# --8<-- [start:import-numpy]
import numpy as np
# --8<-- [end:import-numpy]
# --8<-- [start:import-datetime]
from datetime import datetime
# --8<-- [end:import-datetime]
# --8<-- [start:import-lancedb-pydantic]
from lancedb.pydantic import Vector, LanceModel
# --8<-- [end:import-lancedb-pydantic]
# --8<-- [start:import-pydantic-base-model]
from pydantic import BaseModel
# --8<-- [end:import-pydantic-base-model]
# --8<-- [start:import-lancedb-fts]
from lancedb.index import FTS
# --8<-- [end:import-lancedb-fts]
# --8<-- [start:import-os]
import os
# --8<-- [end:import-os]
# --8<-- [start:import-embeddings]
from lancedb.embeddings import get_registry
# --8<-- [end:import-embeddings]
import pytest
# --8<-- [start:class-definition]
class Metadata(BaseModel):
source: str
timestamp: datetime
class Document(BaseModel):
content: str
meta: Metadata
class LanceSchema(LanceModel):
id: str
vector: Vector(1536)
payload: Document
# --8<-- [end:class-definition]
def test_vector_search():
# --8<-- [start:exhaustive_search]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
data = [
{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((10_000, 1536)).astype("float32"))
]
tbl = db.create_table("vector_search", data=data)
tbl.search(np.random.random((1536))).limit(10).to_list()
# --8<-- [end:exhaustive_search]
# --8<-- [start:exhaustive_search_cosine]
tbl.search(np.random.random((1536))).metric("cosine").limit(10).to_list()
# --8<-- [end:exhaustive_search_cosine]
# --8<-- [start:create_table_with_nested_schema]
# Let's add 100 sample rows to our dataset
data = [
LanceSchema(
id=f"id{i}",
vector=np.random.randn(1536),
payload=Document(
content=f"document{i}",
meta=Metadata(source=f"source{i % 10}", timestamp=datetime.now()),
),
)
for i in range(100)
]
# Synchronous client
tbl = db.create_table("documents", data=data)
# --8<-- [end:create_table_with_nested_schema]
# --8<-- [start:search_result_as_pyarrow]
tbl.search(np.random.randn(1536)).to_arrow()
# --8<-- [end:search_result_as_pyarrow]
# --8<-- [start:search_result_as_pandas]
tbl.search(np.random.randn(1536)).to_pandas()
# --8<-- [end:search_result_as_pandas]
# --8<-- [start:search_result_as_pandas_flatten_true]
tbl.search(np.random.randn(1536)).to_pandas(flatten=True)
# --8<-- [end:search_result_as_pandas_flatten_true]
# --8<-- [start:search_result_as_pandas_flatten_1]
tbl.search(np.random.randn(1536)).to_pandas(flatten=1)
# --8<-- [end:search_result_as_pandas_flatten_1]
# --8<-- [start:search_result_as_list]
tbl.search(np.random.randn(1536)).to_list()
# --8<-- [end:search_result_as_list]
# --8<-- [start:search_result_as_pydantic]
tbl.search(np.random.randn(1536)).to_pydantic(LanceSchema)
# --8<-- [end:search_result_as_pydantic]
@pytest.mark.asyncio
async def test_vector_search_async():
# --8<-- [start:exhaustive_search_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
data = [
{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((10_000, 1536)).astype("float32"))
]
async_tbl = await async_db.create_table("vector_search_async", data=data)
(await async_tbl.query().nearest_to(np.random.random((1536))).limit(10).to_list())
# --8<-- [end:exhaustive_search_async]
# --8<-- [start:exhaustive_search_async_cosine]
(
await async_tbl.query()
.nearest_to(np.random.random((1536)))
.distance_type("cosine")
.limit(10)
.to_list()
)
# --8<-- [end:exhaustive_search_async_cosine]
# --8<-- [start:create_table_async_with_nested_schema]
# Let's add 100 sample rows to our dataset
data = [
LanceSchema(
id=f"id{i}",
vector=np.random.randn(1536),
payload=Document(
content=f"document{i}",
meta=Metadata(source=f"source{i % 10}", timestamp=datetime.now()),
),
)
for i in range(100)
]
async_tbl = await async_db.create_table("documents_async", data=data)
# --8<-- [end:create_table_async_with_nested_schema]
# --8<-- [start:search_result_async_as_pyarrow]
await async_tbl.query().nearest_to(np.random.randn(1536)).to_arrow()
# --8<-- [end:search_result_async_as_pyarrow]
# --8<-- [start:search_result_async_as_pandas]
await async_tbl.query().nearest_to(np.random.randn(1536)).to_pandas()
# --8<-- [end:search_result_async_as_pandas]
# --8<-- [start:search_result_async_as_list]
await async_tbl.query().nearest_to(np.random.randn(1536)).to_list()
# --8<-- [end:search_result_async_as_list]
def test_fts_native():
# --8<-- [start:basic_fts]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table(
"my_table_fts",
data=[
{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"},
{"vector": [5.9, 26.5], "text": "There are several kittens playing"},
],
)
# Pass `use_tantivy=False` to use the native Lance FTS index
# (`use_tantivy=True` is the default)
table.create_fts_index("text", use_tantivy=False)
table.search("puppy").limit(10).select(["text"]).to_list()
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
# ...
# --8<-- [end:basic_fts]
# --8<-- [start:fts_config_stem]
table.create_fts_index("text", tokenizer_name="en_stem", replace=True)
# --8<-- [end:fts_config_stem]
# --8<-- [start:fts_config_folding]
table.create_fts_index(
"text",
use_tantivy=False,
language="French",
stem=True,
ascii_folding=True,
replace=True,
)
# --8<-- [end:fts_config_folding]
# --8<-- [start:fts_prefiltering]
table.search("puppy").limit(10).where("text='foo'", prefilter=True).to_list()
# --8<-- [end:fts_prefiltering]
# --8<-- [start:fts_postfiltering]
table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
# --8<-- [end:fts_postfiltering]
# --8<-- [start:fts_with_position]
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
# --8<-- [end:fts_with_position]
# --8<-- [start:fts_incremental_index]
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
table.optimize()
# --8<-- [end:fts_incremental_index]
@pytest.mark.asyncio
async def test_fts_native_async():
# --8<-- [start:basic_fts_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
async_tbl = await async_db.create_table(
"my_table_fts_async",
data=[
{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"},
{"vector": [5.9, 26.5], "text": "There are several kittens playing"},
],
)
# async API uses our native FTS algorithm
await async_tbl.create_index("text", config=FTS())
await (
async_tbl.query().nearest_to_text("puppy").select(["text"]).limit(10).to_list()
)
# [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
# ...
# --8<-- [end:basic_fts_async]
# --8<-- [start:fts_config_stem_async]
await async_tbl.create_index(
"text", config=FTS(language="English", stem=True, remove_stop_words=True)
)
# --8<-- [end:fts_config_stem_async]
# --8<-- [start:fts_config_folding_async]
await async_tbl.create_index(
"text", config=FTS(language="French", stem=True, ascii_folding=True)
)
# --8<-- [end:fts_config_folding_async]
# --8<-- [start:fts_prefiltering_async]
await (
async_tbl.query()
.nearest_to_text("puppy")
.limit(10)
.where("text='foo'")
.to_list()
)
# --8<-- [end:fts_prefiltering_async]
# --8<-- [start:fts_postfiltering_async]
await (
async_tbl.query()
.nearest_to_text("puppy")
.limit(10)
.where("text='foo'")
.postfilter()
.to_list()
)
# --8<-- [end:fts_postfiltering_async]
# --8<-- [start:fts_with_position_async]
await async_tbl.create_index("text", config=FTS(with_position=True))
# --8<-- [end:fts_with_position_async]
# --8<-- [start:fts_incremental_index_async]
await async_tbl.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
await async_tbl.optimize()
# --8<-- [end:fts_incremental_index_async]
@pytest.mark.skip()
def test_hybrid_search():
# --8<-- [start:import-openai]
import openai
# --8<-- [end:import-openai]
# --8<-- [start:openai-embeddings]
# Ingest the embedding function into the LanceDB table.
# Configure the OPENAI_API_KEY environment variable,
if "OPENAI_API_KEY" not in os.environ:
# or set the key here as a variable
openai.api_key = "sk-..."
embeddings = get_registry().get("openai").create()
# --8<-- [end:openai-embeddings]
# --8<-- [start:class-Documents]
class Documents(LanceModel):
vector: Vector(embeddings.ndims()) = embeddings.VectorField()
text: str = embeddings.SourceField()
# --8<-- [end:class-Documents]
# --8<-- [start:basic_hybrid_search]
data = [
{"text": "rebel spaceships striking from a hidden base"},
{"text": "have won their first victory against the evil Galactic Empire"},
{"text": "during the battle rebel spies managed to steal secret plans"},
{"text": "to the Empire's ultimate weapon the Death Star"},
]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("documents", schema=Documents)
# ingest docs with auto-vectorization
table.add(data)
# Create a fts index before the hybrid search
table.create_fts_index("text")
# hybrid search with default re-ranker
table.search("flower moon", query_type="hybrid").to_pandas()
# --8<-- [end:basic_hybrid_search]
# --8<-- [start:hybrid_search_pass_vector_text]
vector_query = [0.1, 0.2, 0.3, 0.4, 0.5]
text_query = "flower moon"
(
table.search(query_type="hybrid")
.vector(vector_query)
.text(text_query)
.limit(5)
.to_pandas()
)
# --8<-- [end:hybrid_search_pass_vector_text]
@pytest.mark.skip
async def test_hybrid_search_async():
import openai
# --8<-- [start:openai-embeddings]
# Ingest the embedding function into the LanceDB table.
# Configure the OPENAI_API_KEY environment variable,
if "OPENAI_API_KEY" not in os.environ:
# or set the key here as a variable
openai.api_key = "sk-..."
embeddings = get_registry().get("openai").create()
# --8<-- [end:openai-embeddings]
# --8<-- [start:class-Documents]
class Documents(LanceModel):
vector: Vector(embeddings.ndims()) = embeddings.VectorField()
text: str = embeddings.SourceField()
# --8<-- [end:class-Documents]
# --8<-- [start:basic_hybrid_search_async]
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)
data = [
{"text": "rebel spaceships striking from a hidden base"},
{"text": "have won their first victory against the evil Galactic Empire"},
{"text": "during the battle rebel spies managed to steal secret plans"},
{"text": "to the Empire's ultimate weapon the Death Star"},
]
async_tbl = await async_db.create_table("documents_async", schema=Documents)
# ingest docs with auto-vectorization
await async_tbl.add(data)
# Create a fts index before the hybrid search
await async_tbl.create_index("text", config=FTS())
text_query = "flower moon"
vector_query = embeddings.compute_query_embeddings(text_query)[0]
# hybrid search with default re-ranker
await (
async_tbl.query()
.nearest_to(vector_query)
.nearest_to_text(text_query)
.to_pandas()
)
# --8<-- [end:basic_hybrid_search_async]
# --8<-- [start:hybrid_search_pass_vector_text_async]
vector_query = [0.1, 0.2, 0.3, 0.4, 0.5]
text_query = "flower moon"
await (
async_tbl.query()
.nearest_to(vector_query)
.nearest_to_text(text_query)
.limit(5)
.to_pandas()
)
# --8<-- [end:hybrid_search_pass_vector_text_async]