mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-09 13:22:58 +00:00
docs: add async examples to doc (#1941)
- added sync and async tabs for python examples - moved python code to tests/docs --------- Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
@@ -125,7 +125,7 @@ async def test_quickstart_async():
|
||||
|
||||
# --8<-- [start:create_table_async]
|
||||
# Asynchronous client
|
||||
async_tbl = await async_db.create_table("my_table2", data=data)
|
||||
async_tbl = await async_db.create_table("my_table_async", data=data)
|
||||
# --8<-- [end:create_table_async]
|
||||
|
||||
df = pd.DataFrame(
|
||||
@@ -137,17 +137,17 @@ async def test_quickstart_async():
|
||||
|
||||
# --8<-- [start:create_table_async_pandas]
|
||||
# Asynchronous client
|
||||
async_tbl = await async_db.create_table("table_from_df2", df)
|
||||
async_tbl = await async_db.create_table("table_from_df_async", df)
|
||||
# --8<-- [end:create_table_async_pandas]
|
||||
|
||||
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
|
||||
# --8<-- [start:create_empty_table_async]
|
||||
# Asynchronous client
|
||||
async_tbl = await async_db.create_table("empty_table2", schema=schema)
|
||||
async_tbl = await async_db.create_table("empty_table_async", schema=schema)
|
||||
# --8<-- [end:create_empty_table_async]
|
||||
# --8<-- [start:open_table_async]
|
||||
# Asynchronous client
|
||||
async_tbl = await async_db.open_table("my_table2")
|
||||
async_tbl = await async_db.open_table("my_table_async")
|
||||
# --8<-- [end:open_table_async]
|
||||
# --8<-- [start:table_names_async]
|
||||
# Asynchronous client
|
||||
@@ -161,6 +161,22 @@ async def test_quickstart_async():
|
||||
data = [{"vector": [x, x], "item": "filler", "price": x * x} for x in range(1000)]
|
||||
await async_tbl.add(data)
|
||||
# --8<-- [start:vector_search_async]
|
||||
# --8<-- [start:add_columns_async]
|
||||
await async_tbl.add_columns({"double_price": "cast((price * 2) as float)"})
|
||||
# --8<-- [end:add_columns_async]
|
||||
# --8<-- [start:alter_columns_async]
|
||||
await async_tbl.alter_columns(
|
||||
{
|
||||
"path": "double_price",
|
||||
"rename": "dbl_price",
|
||||
"data_type": pa.float64(),
|
||||
"nullable": True,
|
||||
}
|
||||
)
|
||||
# --8<-- [end:alter_columns_async]
|
||||
# --8<-- [start:drop_columns_async]
|
||||
await async_tbl.drop_columns(["dbl_price"])
|
||||
# --8<-- [end:drop_columns_async]
|
||||
# Asynchronous client
|
||||
await async_tbl.vector_search([100, 100]).limit(2).to_pandas()
|
||||
# --8<-- [end:vector_search_async]
|
||||
@@ -174,5 +190,5 @@ async def test_quickstart_async():
|
||||
# --8<-- [end:delete_rows_async]
|
||||
# --8<-- [start:drop_table_async]
|
||||
# Asynchronous client
|
||||
await async_db.drop_table("my_table2")
|
||||
await async_db.drop_table("my_table_async")
|
||||
# --8<-- [end:drop_table_async]
|
||||
|
||||
169
python/python/tests/docs/test_guide_index.py
Normal file
169
python/python/tests/docs/test_guide_index.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# --8<-- [start:import-lancedb]
|
||||
import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-lancedb-ivfpq]
|
||||
from lancedb.index import IvfPq
|
||||
|
||||
# --8<-- [end:import-lancedb-ivfpq]
|
||||
# --8<-- [start:import-lancedb-btree-bitmap]
|
||||
from lancedb.index import BTree, Bitmap
|
||||
|
||||
# --8<-- [end:import-lancedb-btree-bitmap]
|
||||
# --8<-- [start:import-numpy]
|
||||
import numpy as np
|
||||
|
||||
# --8<-- [end:import-numpy]
|
||||
import pytest
|
||||
|
||||
|
||||
def test_ann_index():
    """Docs snippet test: build an IVF-PQ ANN index and run vector searches.

    The ``--8<--`` markers delimit regions that the docs include verbatim.
    Uses the synchronous client; the async variant is test_ann_index_async.
    """
    # --8<-- [start:create_ann_index]
    uri = "data/sample-lancedb"

    # Create 5,000 sample vectors
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
    ]

    db = lancedb.connect(uri)
    # Add the vectors to a table
    tbl = db.create_table("my_vectors", data=data)
    # Create and train the index - you need to have enough data in the table
    # for an effective training step
    tbl.create_index(num_partitions=2, num_sub_vectors=4)
    # --8<-- [end:create_ann_index]
    # --8<-- [start:vector_search]
    tbl.search(np.random.random((32))).limit(2).nprobes(20).refine_factor(
        10
    ).to_pandas()
    # --8<-- [end:vector_search]
    # --8<-- [start:vector_search_with_filter]
    tbl.search(np.random.random((32))).where("item != 'item 1141'").to_pandas()
    # --8<-- [end:vector_search_with_filter]
    # --8<-- [start:vector_search_with_select]
    tbl.search(np.random.random((32))).select(["vector"]).to_pandas()
    # --8<-- [end:vector_search_with_select]
@pytest.mark.asyncio
async def test_ann_index_async():
    """Async variant of test_ann_index, using the asynchronous client.

    Index configuration is passed via ``IvfPq`` rather than keyword args.
    """
    # --8<-- [start:create_ann_index_async]
    uri = "data/sample-lancedb"

    # Create 5,000 sample vectors
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((5_000, 32)).astype("float32"))
    ]

    async_db = await lancedb.connect_async(uri)
    # Add the vectors to a table
    async_tbl = await async_db.create_table("my_vectors_async", data=data)
    # Create and train the index - you need to have enough data in the table
    # for an effective training step
    await async_tbl.create_index(
        "vector", config=IvfPq(num_partitions=2, num_sub_vectors=4)
    )
    # --8<-- [end:create_ann_index_async]
    # --8<-- [start:vector_search_async]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .limit(2)
        .nprobes(20)
        .refine_factor(10)
        .to_pandas()
    )
    # --8<-- [end:vector_search_async]
    # --8<-- [start:vector_search_async_with_filter]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .where("item != 'item 1141'")
        .to_pandas()
    )
    # --8<-- [end:vector_search_async_with_filter]
    # --8<-- [start:vector_search_async_with_select]
    await (
        async_tbl.query()
        .nearest_to(np.random.random((32)))
        .select(["vector"])
        .to_pandas()
    )
    # --8<-- [end:vector_search_async_with_select]
def test_scalar_index():
    """Docs snippet test: create BTree/Bitmap scalar indices and filter with them."""
    # --8<-- [start:basic_scalar_index]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    books = [
        {
            "book_id": 1,
            "publisher": "plenty of books",
            "tags": ["fantasy", "adventure"],
        },
        {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
        {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
    ]
    table = db.create_table("books", books)
    table.create_scalar_index("book_id")  # BTree by default
    table.create_scalar_index("publisher", index_type="BITMAP")
    # --8<-- [end:basic_scalar_index]
    # --8<-- [start:search_with_scalar_index]
    table = db.open_table("books")
    table.search().where("book_id = 2").to_pandas()
    # --8<-- [end:search_with_scalar_index]
    # --8<-- [start:vector_search_with_scalar_index]
    data = [
        {"book_id": 1, "vector": [1, 2]},
        {"book_id": 2, "vector": [3, 4]},
        {"book_id": 3, "vector": [5, 6]},
    ]

    table = db.create_table("book_with_embeddings", data)
    (table.search([1, 2]).where("book_id != 3", prefilter=True).to_pandas())
    # --8<-- [end:vector_search_with_scalar_index]
    # --8<-- [start:update_scalar_index]
    table.add([{"vector": [7, 8], "book_id": 4}])
    table.optimize()
    # --8<-- [end:update_scalar_index]
@pytest.mark.asyncio
async def test_scalar_index_async():
    """Async variant of test_scalar_index; index types are passed as configs."""
    # --8<-- [start:basic_scalar_index_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    books = [
        {
            "book_id": 1,
            "publisher": "plenty of books",
            "tags": ["fantasy", "adventure"],
        },
        {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
        {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]},
    ]
    async_tbl = await async_db.create_table("books_async", books)
    await async_tbl.create_index("book_id", config=BTree())  # BTree by default
    await async_tbl.create_index("publisher", config=Bitmap())
    # --8<-- [end:basic_scalar_index_async]
    # --8<-- [start:search_with_scalar_index_async]
    async_tbl = await async_db.open_table("books_async")
    await async_tbl.query().where("book_id = 2").to_pandas()
    # --8<-- [end:search_with_scalar_index_async]
    # --8<-- [start:vector_search_with_scalar_index_async]
    data = [
        {"book_id": 1, "vector": [1, 2]},
        {"book_id": 2, "vector": [3, 4]},
        {"book_id": 3, "vector": [5, 6]},
    ]
    async_tbl = await async_db.create_table("book_with_embeddings_async", data)
    (await async_tbl.query().where("book_id != 3").nearest_to([1, 2]).to_pandas())
    # --8<-- [end:vector_search_with_scalar_index_async]
    # --8<-- [start:update_scalar_index_async]
    await async_tbl.add([{"vector": [7, 8], "book_id": 4}])
    await async_tbl.optimize()
    # --8<-- [end:update_scalar_index_async]
576
python/python/tests/docs/test_guide_tables.py
Normal file
576
python/python/tests/docs/test_guide_tables.py
Normal file
@@ -0,0 +1,576 @@
|
||||
# --8<-- [start:import-lancedb]
|
||||
import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-pandas]
|
||||
import pandas as pd
|
||||
|
||||
# --8<-- [end:import-pandas]
|
||||
# --8<-- [start:import-pyarrow]
|
||||
import pyarrow as pa
|
||||
|
||||
# --8<-- [end:import-pyarrow]
|
||||
# --8<-- [start:import-polars]
|
||||
import polars as pl
|
||||
|
||||
# --8<-- [end:import-polars]
|
||||
# --8<-- [start:import-numpy]
|
||||
import numpy as np
|
||||
|
||||
# --8<-- [end:import-numpy]
|
||||
# --8<-- [start:import-lancedb-pydantic]
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
# --8<-- [end:import-lancedb-pydantic]
|
||||
# --8<-- [start:import-datetime]
|
||||
from datetime import timedelta
|
||||
|
||||
# --8<-- [end:import-datetime]
|
||||
# --8<-- [start:import-embeddings]
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
# --8<-- [end:import-embeddings]
|
||||
# --8<-- [start:import-pydantic-basemodel]
|
||||
from pydantic import BaseModel
|
||||
|
||||
# --8<-- [end:import-pydantic-basemodel]
|
||||
import pytest
|
||||
|
||||
|
||||
# --8<-- [start:class-Content]
class Content(LanceModel):
    """Docs snippet: movie-row schema with a derived IMDb URL property."""

    movie_id: int
    vector: Vector(128)
    genres: str
    title: str
    imdb_id: int

    @property
    def imdb_url(self) -> str:
        # Derived, not stored: built from imdb_id on access.
        return f"https://www.imdb.com/title/tt{self.imdb_id}"


# --8<-- [end:class-Content]
# --8<-- [start:class-Document]
class Document(BaseModel):
    """Docs snippet: plain pydantic payload nested inside NestedSchema."""

    content: str
    source: str


# --8<-- [end:class-Document]
# --8<-- [start:class-NestedSchema]
class NestedSchema(LanceModel):
    """Docs snippet: table schema embedding a nested Document model."""

    id: str
    vector: Vector(1536)
    document: Document


# --8<-- [end:class-NestedSchema]
# --8<-- [start:class-Item]
class Item(LanceModel):
    """Docs snippet: minimal vector/item/price row schema."""

    vector: Vector(2)
    item: str
    price: float


# --8<-- [end:class-Item]
# --8<-- [start:make_batches]
def make_batches():
    """Yield five identical two-row pyarrow RecordBatches (vector/item/price)."""
    for _ in range(5):  # loop variable unused; batches are identical
        yield pa.RecordBatch.from_arrays(
            [
                pa.array(
                    [[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
                    pa.list_(pa.float32(), 4),
                ),
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            ["vector", "item", "price"],
        )


# --8<-- [end:make_batches]
# --8<-- [start:make_batches_for_add]
def make_batches_for_add():
    """Yield five identical two-row batches of plain dicts for Table.add()."""
    for _ in range(5):  # loop variable unused; batches are identical
        yield [
            {"vector": [3.1, 4.1], "item": "peach", "price": 6.0},
            {"vector": [5.9, 26.5], "item": "pear", "price": 5.0},
        ]


# --8<-- [end:make_batches_for_add]
def test_table():
    """Docs snippet test covering the synchronous table API end to end.

    Exercises table creation (dicts, pandas, polars, arrow, pydantic,
    record-batch iterables), listing/opening tables, adding data, deleting
    and updating rows, and read-consistency options.
    """
    # --8<-- [start:connect]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    # --8<-- [end:connect]
    # --8<-- [start:create_table]
    data = [
        {"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
        {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1},
    ]
    db.create_table("test_table", data)
    db["test_table"].head()
    # --8<-- [end:create_table]
    # --8<-- [start:create_table_exist_ok]
    db.create_table("test_table", data, exist_ok=True)
    # --8<-- [end:create_table_exist_ok]
    # --8<-- [start:create_table_overwrite]
    db.create_table("test_table", data, mode="overwrite")
    # --8<-- [end:create_table_overwrite]
    # --8<-- [start:create_table_from_pandas]
    data = pd.DataFrame(
        {
            "vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
            "lat": [45.5, 40.1],
            "long": [-122.7, -74.1],
        }
    )
    db.create_table("my_table_pandas", data)
    db["my_table_pandas"].head()
    # --8<-- [end:create_table_from_pandas]
    # --8<-- [start:create_table_custom_schema]
    custom_schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 4)),
            pa.field("lat", pa.float32()),
            pa.field("long", pa.float32()),
        ]
    )

    tbl = db.create_table("my_table_custom_schema", data, schema=custom_schema)
    # --8<-- [end:create_table_custom_schema]
    # --8<-- [start:create_table_from_polars]
    data = pl.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    tbl = db.create_table("my_table_pl", data)
    # --8<-- [end:create_table_from_polars]
    # --8<-- [start:create_table_from_arrow_table]
    dim = 16
    total = 2
    schema = pa.schema(
        [pa.field("vector", pa.list_(pa.float16(), dim)), pa.field("text", pa.string())]
    )
    data = pa.Table.from_arrays(
        [
            pa.array(
                [np.random.randn(dim).astype(np.float16) for _ in range(total)],
                pa.list_(pa.float16(), dim),
            ),
            pa.array(["foo", "bar"]),
        ],
        ["vector", "text"],
    )
    tbl = db.create_table("f16_tbl", data, schema=schema)
    # --8<-- [end:create_table_from_arrow_table]
    # --8<-- [start:create_table_from_pydantic]
    tbl = db.create_table("movielens_small", schema=Content)
    # --8<-- [end:create_table_from_pydantic]
    # --8<-- [start:create_table_nested_schema]
    tbl = db.create_table("nested_table", schema=NestedSchema)
    # --8<-- [end:create_table_nested_schema]
    # --8<-- [start:create_table_from_batch]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 4)),
            pa.field("item", pa.utf8()),
            pa.field("price", pa.float32()),
        ]
    )
    # Fixed typo "batched_tale" -> "batched_table" (matches the async test).
    db.create_table("batched_table", make_batches(), schema=schema)
    # --8<-- [end:create_table_from_batch]
    # --8<-- [start:list_tables]
    print(db.table_names())
    # --8<-- [end:list_tables]
    # --8<-- [start:open_table]
    tbl = db.open_table("test_table")
    # --8<-- [end:open_table]
    # --8<-- [start:create_empty_table]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 2)),
            pa.field("item", pa.string()),
            pa.field("price", pa.float32()),
        ]
    )
    tbl = db.create_table("test_empty_table", schema=schema)
    # --8<-- [end:create_empty_table]
    # --8<-- [start:create_empty_table_pydantic]
    tbl = db.create_table("test_empty_table_new", schema=Item.to_arrow_schema())
    # --8<-- [end:create_empty_table_pydantic]
    # --8<-- [start:add_table_from_pandas]
    df = pd.DataFrame(
        {
            "vector": [[1.3, 1.4], [9.5, 56.2]],
            "item": ["banana", "apple"],
            "price": [5.0, 7.0],
        }
    )

    tbl.add(df)
    # --8<-- [end:add_table_from_pandas]
    # --8<-- [start:add_table_from_polars]
    df = pl.DataFrame(
        {
            "vector": [[1.3, 1.4], [9.5, 56.2]],
            "item": ["banana", "apple"],
            "price": [5.0, 7.0],
        }
    )

    tbl.add(df)
    # --8<-- [end:add_table_from_polars]
    # --8<-- [start:add_table_from_batch]
    tbl.add(make_batches_for_add())
    # --8<-- [end:add_table_from_batch]
    # --8<-- [start:add_table_from_pyarrow]
    pa_table = pa.Table.from_arrays(
        [
            pa.array([[9.1, 6.7], [9.9, 31.2]], pa.list_(pa.float32(), 2)),
            pa.array(["mango", "orange"]),
            pa.array([7.0, 4.0]),
        ],
        ["vector", "item", "price"],
    )
    tbl.add(pa_table)
    # --8<-- [end:add_table_from_pyarrow]
    # --8<-- [start:add_table_from_pydantic]
    pydantic_model_items = [
        Item(vector=[8.1, 4.7], item="pineapple", price=10.0),
        Item(vector=[6.9, 9.3], item="avocado", price=9.0),
    ]
    tbl.add(pydantic_model_items)
    # --8<-- [end:add_table_from_pydantic]
    # --8<-- [start:delete_row]
    tbl.delete('item = "fizz"')
    # --8<-- [end:delete_row]
    # --8<-- [start:delete_specific_row]
    data = [
        {"x": 1, "vector": [1, 2]},
        {"x": 2, "vector": [3, 4]},
        {"x": 3, "vector": [5, 6]},
    ]
    # Synchronous client
    tbl = db.create_table("delete_row", data)
    tbl.to_pandas()
    #   x      vector
    # 0 1  [1.0, 2.0]
    # 1 2  [3.0, 4.0]
    # 2 3  [5.0, 6.0]

    tbl.delete("x = 2")
    tbl.to_pandas()
    #   x      vector
    # 0 1  [1.0, 2.0]
    # 1 3  [5.0, 6.0]
    # --8<-- [end:delete_specific_row]
    # --8<-- [start:delete_list_values]
    to_remove = [1, 5]
    to_remove = ", ".join(str(v) for v in to_remove)

    tbl.delete(f"x IN ({to_remove})")
    tbl.to_pandas()
    #   x      vector
    # 0 3  [5.0, 6.0]
    # --8<-- [end:delete_list_values]
    # --8<-- [start:update_table]
    # Create a table from a pandas DataFrame
    data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})

    tbl = db.create_table("test_table", data, mode="overwrite")
    # Update the table where x = 2
    tbl.update(where="x = 2", values={"vector": [10, 10]})
    # Get the updated table as a pandas DataFrame
    df = tbl.to_pandas()
    print(df)
    # --8<-- [end:update_table]
    # --8<-- [start:update_table_sql]
    # Update the table where x = 2
    tbl.update(values_sql={"x": "x + 1"})
    print(tbl.to_pandas())
    # --8<-- [end:update_table_sql]
    # --8<-- [start:table_strong_consistency]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri, read_consistency_interval=timedelta(0))
    tbl = db.open_table("test_table")
    # --8<-- [end:table_strong_consistency]
    # --8<-- [start:table_eventual_consistency]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=5))
    tbl = db.open_table("test_table")
    # --8<-- [end:table_eventual_consistency]
    # --8<-- [start:table_checkout_latest]
    tbl = db.open_table("test_table")

    # (Other writes happen to my_table from another process)

    # Check for updates
    tbl.checkout_latest()
    # --8<-- [end:table_checkout_latest]
@pytest.mark.skip
def test_table_with_embedding():
    """Docs snippet test: table with an automatically embedded text column.

    Skipped by default — presumably because the HuggingFace embedding model
    must be downloaded; confirm before re-enabling.
    """
    db = lancedb.connect("data/sample-lancedb")
    # --8<-- [start:create_table_with_embedding]
    embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-small-en-v1.5")

    class Schema(LanceModel):
        text: str = embed_fcn.SourceField()
        vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField(default=None)

    tbl = db.create_table("my_table_with_embedding", schema=Schema, mode="overwrite")
    models = [Schema(text="hello"), Schema(text="world")]
    tbl.add(models)
    # --8<-- [end:create_table_with_embedding]
@pytest.mark.skip
async def test_table_with_embedding_async():
    """Async docs snippet test: table with an automatically embedded column.

    Skipped by default — presumably because the HuggingFace embedding model
    must be downloaded; confirm before re-enabling.
    NOTE(review): unlike the other async tests this lacks
    ``@pytest.mark.asyncio``; harmless while skipped, but add it if enabled.
    """
    async_db = await lancedb.connect_async("data/sample-lancedb")
    # --8<-- [start:create_table_async_with_embedding]
    embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-small-en-v1.5")

    class Schema(LanceModel):
        text: str = embed_fcn.SourceField()
        vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField(default=None)

    async_tbl = await async_db.create_table(
        "my_table_async_with_embedding", schema=Schema, mode="overwrite"
    )
    models = [Schema(text="hello"), Schema(text="world")]
    await async_tbl.add(models)
    # --8<-- [end:create_table_async_with_embedding]
@pytest.mark.asyncio
async def test_table_async():
    """Docs snippet test covering the asynchronous table API end to end.

    Mirrors test_table using the async client. Fixes a snippet-tag typo:
    the start tag read ``table_async_ventual_consistency`` while the end tag
    read ``table_async_eventual_consistency``, so the docs include could
    never resolve the region.
    """
    # --8<-- [start:connect_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    # --8<-- [end:connect_async]
    # --8<-- [start:create_table_async]
    data = [
        {"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
        {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1},
    ]
    async_tbl = await async_db.create_table("test_table_async", data)
    await async_tbl.head()
    # --8<-- [end:create_table_async]
    # --8<-- [start:create_table_async_exist_ok]
    await async_db.create_table("test_table_async", data, exist_ok=True)
    # --8<-- [end:create_table_async_exist_ok]
    # --8<-- [start:create_table_async_overwrite]
    await async_db.create_table("test_table_async", data, mode="overwrite")
    # --8<-- [end:create_table_async_overwrite]
    # --8<-- [start:create_table_async_from_pandas]
    data = pd.DataFrame(
        {
            "vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
            "lat": [45.5, 40.1],
            "long": [-122.7, -74.1],
        }
    )
    async_tbl = await async_db.create_table("my_table_async_pd", data)
    await async_tbl.head()
    # --8<-- [end:create_table_async_from_pandas]
    # --8<-- [start:create_table_async_custom_schema]
    custom_schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 4)),
            pa.field("lat", pa.float32()),
            pa.field("long", pa.float32()),
        ]
    )
    async_tbl = await async_db.create_table(
        "my_table_async_custom_schema", data, schema=custom_schema
    )
    # --8<-- [end:create_table_async_custom_schema]
    # --8<-- [start:create_table_async_from_polars]
    data = pl.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    async_tbl = await async_db.create_table("my_table_async_pl", data)
    # --8<-- [end:create_table_async_from_polars]
    # --8<-- [start:create_table_async_from_arrow_table]
    dim = 16
    total = 2
    schema = pa.schema(
        [pa.field("vector", pa.list_(pa.float16(), dim)), pa.field("text", pa.string())]
    )
    data = pa.Table.from_arrays(
        [
            pa.array(
                [np.random.randn(dim).astype(np.float16) for _ in range(total)],
                pa.list_(pa.float16(), dim),
            ),
            pa.array(["foo", "bar"]),
        ],
        ["vector", "text"],
    )
    async_tbl = await async_db.create_table("f16_tbl_async", data, schema=schema)
    # --8<-- [end:create_table_async_from_arrow_table]
    # --8<-- [start:create_table_async_from_pydantic]
    async_tbl = await async_db.create_table("movielens_small_async", schema=Content)
    # --8<-- [end:create_table_async_from_pydantic]
    # --8<-- [start:create_table_async_nested_schema]
    async_tbl = await async_db.create_table("nested_table_async", schema=NestedSchema)
    # --8<-- [end:create_table_async_nested_schema]
    # --8<-- [start:create_table_async_from_batch]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 4)),
            pa.field("item", pa.utf8()),
            pa.field("price", pa.float32()),
        ]
    )
    await async_db.create_table("batched_table", make_batches(), schema=schema)
    # --8<-- [end:create_table_async_from_batch]
    # --8<-- [start:list_tables_async]
    print(await async_db.table_names())
    # --8<-- [end:list_tables_async]
    # --8<-- [start:open_table_async]
    async_tbl = await async_db.open_table("test_table_async")
    # --8<-- [end:open_table_async]
    # --8<-- [start:create_empty_table_async]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 2)),
            pa.field("item", pa.string()),
            pa.field("price", pa.float32()),
        ]
    )
    async_tbl = await async_db.create_table("test_empty_table_async", schema=schema)
    # --8<-- [end:create_empty_table_async]
    # --8<-- [start:create_empty_table_async_pydantic]
    async_tbl = await async_db.create_table(
        "test_empty_table_async_new", schema=Item.to_arrow_schema()
    )
    # --8<-- [end:create_empty_table_async_pydantic]
    # --8<-- [start:add_table_async_from_pandas]
    df = pd.DataFrame(
        {
            "vector": [[1.3, 1.4], [9.5, 56.2]],
            "item": ["banana", "apple"],
            "price": [5.0, 7.0],
        }
    )
    await async_tbl.add(df)
    # --8<-- [end:add_table_async_from_pandas]
    # --8<-- [start:add_table_async_from_polars]
    df = pl.DataFrame(
        {
            "vector": [[1.3, 1.4], [9.5, 56.2]],
            "item": ["banana", "apple"],
            "price": [5.0, 7.0],
        }
    )
    await async_tbl.add(df)
    # --8<-- [end:add_table_async_from_polars]
    # --8<-- [start:add_table_async_from_batch]
    await async_tbl.add(make_batches_for_add())
    # --8<-- [end:add_table_async_from_batch]
    # --8<-- [start:add_table_async_from_pyarrow]
    pa_table = pa.Table.from_arrays(
        [
            pa.array([[9.1, 6.7], [9.9, 31.2]], pa.list_(pa.float32(), 2)),
            pa.array(["mango", "orange"]),
            pa.array([7.0, 4.0]),
        ],
        ["vector", "item", "price"],
    )
    await async_tbl.add(pa_table)
    # --8<-- [end:add_table_async_from_pyarrow]
    # --8<-- [start:add_table_async_from_pydantic]
    pydantic_model_items = [
        Item(vector=[8.1, 4.7], item="pineapple", price=10.0),
        Item(vector=[6.9, 9.3], item="avocado", price=9.0),
    ]
    await async_tbl.add(pydantic_model_items)
    # --8<-- [end:add_table_async_from_pydantic]
    # --8<-- [start:delete_row_async]
    await async_tbl.delete('item = "fizz"')
    # --8<-- [end:delete_row_async]
    # --8<-- [start:delete_specific_row_async]
    data = [
        {"x": 1, "vector": [1, 2]},
        {"x": 2, "vector": [3, 4]},
        {"x": 3, "vector": [5, 6]},
    ]
    async_db = await lancedb.connect_async(uri)
    async_tbl = await async_db.create_table("delete_row_async", data)
    await async_tbl.to_pandas()
    #   x      vector
    # 0 1  [1.0, 2.0]
    # 1 2  [3.0, 4.0]
    # 2 3  [5.0, 6.0]

    await async_tbl.delete("x = 2")
    await async_tbl.to_pandas()
    #   x      vector
    # 0 1  [1.0, 2.0]
    # 1 3  [5.0, 6.0]
    # --8<-- [end:delete_specific_row_async]
    # --8<-- [start:delete_list_values_async]
    to_remove = [1, 5]
    to_remove = ", ".join(str(v) for v in to_remove)

    await async_tbl.delete(f"x IN ({to_remove})")
    await async_tbl.to_pandas()
    #   x      vector
    # 0 3  [5.0, 6.0]
    # --8<-- [end:delete_list_values_async]
    # --8<-- [start:update_table_async]
    # Create a table from a pandas DataFrame
    data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})

    async_tbl = await async_db.create_table("update_table_async", data)
    # Update the table where x = 2
    await async_tbl.update({"vector": [10, 10]}, where="x = 2")
    # Get the updated table as a pandas DataFrame
    df = await async_tbl.to_pandas()
    # Print the DataFrame
    print(df)
    # --8<-- [end:update_table_async]
    # --8<-- [start:update_table_sql_async]
    # Update the table where x = 2
    await async_tbl.update(updates_sql={"x": "x + 1"})
    print(await async_tbl.to_pandas())
    # --8<-- [end:update_table_sql_async]
    # --8<-- [start:table_async_strong_consistency]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri, read_consistency_interval=timedelta(0))
    async_tbl = await async_db.open_table("test_table_async")
    # --8<-- [end:table_async_strong_consistency]
    # --8<-- [start:table_async_eventual_consistency]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(
        uri, read_consistency_interval=timedelta(seconds=5)
    )
    async_tbl = await async_db.open_table("test_table_async")
    # --8<-- [end:table_async_eventual_consistency]
    # --8<-- [start:table_async_checkout_latest]
    async_tbl = await async_db.open_table("test_table_async")

    # (Other writes happen to test_table_async from another process)

    # Check for updates
    await async_tbl.checkout_latest()
    # --8<-- [end:table_async_checkout_latest]
187
python/python/tests/docs/test_python.py
Normal file
187
python/python/tests/docs/test_python.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# --8<-- [start:import-lancedb]
|
||||
import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-pandas]
|
||||
import pandas as pd
|
||||
|
||||
# --8<-- [end:import-pandas]
|
||||
# --8<-- [start:import-iterable]
|
||||
from typing import Iterable
|
||||
|
||||
# --8<-- [end:import-iterable]
|
||||
# --8<-- [start:import-pyarrow]
|
||||
import pyarrow as pa
|
||||
|
||||
# --8<-- [end:import-pyarrow]
|
||||
# --8<-- [start:import-polars]
|
||||
import polars as pl
|
||||
|
||||
# --8<-- [end:import-polars]
|
||||
# --8<-- [start:import-lancedb-pydantic]
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
# --8<-- [end:import-lancedb-pydantic]
|
||||
import pytest
|
||||
|
||||
|
||||
# --8<-- [start:make_batches]
def make_batches() -> Iterable[pa.RecordBatch]:
    """Yield five identical two-row pyarrow RecordBatches (vector/item/price)."""
    for _ in range(5):  # loop variable unused; batches are identical
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]]),
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            ["vector", "item", "price"],
        )


# --8<-- [end:make_batches]
def test_pandas_and_pyarrow():
    """Docs snippet test: create tables from pandas / batch iterables and search."""
    # --8<-- [start:connect_to_lancedb]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    # --8<-- [end:connect_to_lancedb]
    # --8<-- [start:create_table_pandas]
    data = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    table = db.create_table("pd_table", data=data)
    # --8<-- [end:create_table_pandas]
    # --8<-- [start:create_table_iterable]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32())),
            pa.field("item", pa.utf8()),
            pa.field("price", pa.float32()),
        ]
    )
    table = db.create_table("iterable_table", data=make_batches(), schema=schema)
    # --8<-- [end:create_table_iterable]
    # --8<-- [start:vector_search]
    # Open the table previously created.
    table = db.open_table("pd_table")

    query_vector = [100, 100]
    # Pandas DataFrame
    df = table.search(query_vector).limit(1).to_pandas()
    print(df)
    # --8<-- [end:vector_search]
    # --8<-- [start:vector_search_with_filter]
    # Apply the filter via LanceDB
    results = table.search([100, 100]).where("price < 15").to_pandas()
    assert len(results) == 1
    assert results["item"].iloc[0] == "foo"

    # Apply the filter via Pandas
    df = results = table.search([100, 100]).to_pandas()
    results = df[df.price < 15]
    assert len(results) == 1
    assert results["item"].iloc[0] == "foo"
    # --8<-- [end:vector_search_with_filter]
@pytest.mark.asyncio
async def test_pandas_and_pyarrow_async():
    """Async variants of the pandas/Arrow docs snippets."""
    # --8<-- [start:connect_to_lancedb_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    # --8<-- [end:connect_to_lancedb_async]
    # --8<-- [start:create_table_pandas_async]
    data = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    await async_db.create_table("pd_table_async", data=data)
    # --8<-- [end:create_table_pandas_async]
    # --8<-- [start:create_table_iterable_async]
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32())),
            pa.field("item", pa.utf8()),
            pa.field("price", pa.float32()),
        ]
    )
    await async_db.create_table(
        "iterable_table_async", data=make_batches(), schema=schema
    )
    # --8<-- [end:create_table_iterable_async]
    # --8<-- [start:vector_search_async]
    # Open the table previously created.
    async_tbl = await async_db.open_table("pd_table_async")

    query_vector = [100, 100]
    # Pandas DataFrame
    df = await async_tbl.query().nearest_to(query_vector).limit(1).to_pandas()
    print(df)
    # --8<-- [end:vector_search_async]
    # --8<-- [start:vector_search_with_filter_async]
    # Apply the filter via LanceDB
    results = (
        await async_tbl.query().nearest_to([100, 100]).where("price < 15").to_pandas()
    )
    assert len(results) == 1
    assert results["item"].iloc[0] == "foo"

    # Apply the filter via Pandas
    df = await async_tbl.query().nearest_to([100, 100]).to_pandas()
    results = df[df.price < 15]
    assert len(results) == 1
    assert results["item"].iloc[0] == "foo"
    # --8<-- [end:vector_search_with_filter_async]
|
||||
|
||||
|
||||
# --8<-- [start:class_Item]
class Item(LanceModel):
    """Row schema for the docs examples: a 2-d vector plus item name and price."""

    vector: Vector(2)
    item: str
    price: float


# --8<-- [end:class_Item]
|
||||
|
||||
|
||||
def test_polars():
    """Docs snippets: Polars DataFrame/LazyFrame integration with LanceDB."""
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)

    # --8<-- [start:create_table_polars]
    data = pl.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    table = db.create_table("pl_table", data=data)
    # --8<-- [end:create_table_polars]
    # --8<-- [start:vector_search_polars]
    query = [3.0, 4.0]
    result = table.search(query).limit(1).to_polars()
    print(result)
    print(type(result))
    # --8<-- [end:vector_search_polars]
    # --8<-- [start:create_table_pydantic]
    table = db.create_table("pydantic_table", schema=Item)
    df = pl.DataFrame(data)
    # Add Polars DataFrame to table
    table.add(df)
    # --8<-- [end:create_table_pydantic]
    # --8<-- [start:dump_table_lazyform]
    ldf = table.to_polars()
    print(type(ldf))
    # --8<-- [end:dump_table_lazyform]
    # --8<-- [start:print_table_lazyform]
    print(ldf.first().collect())
    # --8<-- [end:print_table_lazyform]
|
||||
# ---- New file in this diff: python/python/tests/docs/test_search.py (366 lines) ----
|
||||
# --8<-- [start:import-lancedb]
|
||||
import lancedb
|
||||
|
||||
# --8<-- [end:import-lancedb]
|
||||
# --8<-- [start:import-numpy]
|
||||
import numpy as np
|
||||
|
||||
# --8<-- [end:import-numpy]
|
||||
# --8<-- [start:import-datetime]
|
||||
from datetime import datetime
|
||||
|
||||
# --8<-- [end:import-datetime]
|
||||
# --8<-- [start:import-lancedb-pydantic]
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
# --8<-- [end:import-lancedb-pydantic]
|
||||
# --8<-- [start:import-pydantic-base-model]
|
||||
from pydantic import BaseModel
|
||||
|
||||
# --8<-- [end:import-pydantic-base-model]
|
||||
# --8<-- [start:import-lancedb-fts]
|
||||
from lancedb.index import FTS
|
||||
|
||||
# --8<-- [end:import-lancedb-fts]
|
||||
# --8<-- [start:import-os]
|
||||
import os
|
||||
|
||||
# --8<-- [end:import-os]
|
||||
# --8<-- [start:import-embeddings]
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
# --8<-- [end:import-embeddings]
|
||||
import pytest
|
||||
|
||||
|
||||
# --8<-- [start:class-definition]
class Metadata(BaseModel):
    # Provenance information attached to each document.
    source: str
    timestamp: datetime


class Document(BaseModel):
    # Document payload with nested metadata.
    content: str
    meta: Metadata


class LanceSchema(LanceModel):
    # LanceDB row schema: string id, 1536-d vector, nested pydantic payload.
    id: str
    vector: Vector(1536)
    payload: Document


# --8<-- [end:class-definition]
|
||||
|
||||
|
||||
def test_vector_search():
    """Docs snippets: exhaustive vector search and result-format conversions."""
    # --8<-- [start:exhaustive_search]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((10_000, 1536)).astype("float32"))
    ]
    tbl = db.create_table("vector_search", data=data)
    tbl.search(np.random.random((1536))).limit(10).to_list()
    # --8<-- [end:exhaustive_search]
    # --8<-- [start:exhaustive_search_cosine]
    tbl.search(np.random.random((1536))).metric("cosine").limit(10).to_list()
    # --8<-- [end:exhaustive_search_cosine]
    # --8<-- [start:create_table_with_nested_schema]
    # Let's add 100 sample rows to our dataset
    data = [
        LanceSchema(
            id=f"id{i}",
            vector=np.random.randn(1536),
            payload=Document(
                content=f"document{i}",
                meta=Metadata(source=f"source{i % 10}", timestamp=datetime.now()),
            ),
        )
        for i in range(100)
    ]

    # Synchronous client
    tbl = db.create_table("documents", data=data)
    # --8<-- [end:create_table_with_nested_schema]
    # --8<-- [start:search_result_as_pyarrow]
    tbl.search(np.random.randn(1536)).to_arrow()
    # --8<-- [end:search_result_as_pyarrow]
    # --8<-- [start:search_result_as_pandas]
    tbl.search(np.random.randn(1536)).to_pandas()
    # --8<-- [end:search_result_as_pandas]
    # --8<-- [start:search_result_as_pandas_flatten_true]
    tbl.search(np.random.randn(1536)).to_pandas(flatten=True)
    # --8<-- [end:search_result_as_pandas_flatten_true]
    # --8<-- [start:search_result_as_pandas_flatten_1]
    tbl.search(np.random.randn(1536)).to_pandas(flatten=1)
    # --8<-- [end:search_result_as_pandas_flatten_1]
    # --8<-- [start:search_result_as_list]
    tbl.search(np.random.randn(1536)).to_list()
    # --8<-- [end:search_result_as_list]
    # --8<-- [start:search_result_as_pydantic]
    tbl.search(np.random.randn(1536)).to_pydantic(LanceSchema)
    # --8<-- [end:search_result_as_pydantic]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_vector_search_async():
    """Async variants of the exhaustive-search and result-format docs snippets."""
    # --8<-- [start:exhaustive_search_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    data = [
        {"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((10_000, 1536)).astype("float32"))
    ]
    async_tbl = await async_db.create_table("vector_search_async", data=data)
    (await async_tbl.query().nearest_to(np.random.random((1536))).limit(10).to_list())
    # --8<-- [end:exhaustive_search_async]
    # --8<-- [start:exhaustive_search_async_cosine]
    (
        await async_tbl.query()
        .nearest_to(np.random.random((1536)))
        .distance_type("cosine")
        .limit(10)
        .to_list()
    )
    # --8<-- [end:exhaustive_search_async_cosine]
    # --8<-- [start:create_table_async_with_nested_schema]
    # Let's add 100 sample rows to our dataset
    data = [
        LanceSchema(
            id=f"id{i}",
            vector=np.random.randn(1536),
            payload=Document(
                content=f"document{i}",
                meta=Metadata(source=f"source{i % 10}", timestamp=datetime.now()),
            ),
        )
        for i in range(100)
    ]

    async_tbl = await async_db.create_table("documents_async", data=data)
    # --8<-- [end:create_table_async_with_nested_schema]
    # --8<-- [start:search_result_async_as_pyarrow]
    await async_tbl.query().nearest_to(np.random.randn(1536)).to_arrow()
    # --8<-- [end:search_result_async_as_pyarrow]
    # --8<-- [start:search_result_async_as_pandas]
    await async_tbl.query().nearest_to(np.random.randn(1536)).to_pandas()
    # --8<-- [end:search_result_async_as_pandas]
    # --8<-- [start:search_result_async_as_list]
    await async_tbl.query().nearest_to(np.random.randn(1536)).to_list()
    # --8<-- [end:search_result_async_as_list]
|
||||
|
||||
|
||||
def test_fts_native():
    """Docs snippets: native (non-tantivy) full-text search on the sync API."""
    # --8<-- [start:basic_fts]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)

    table = db.create_table(
        "my_table_fts",
        data=[
            {"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"},
            {"vector": [5.9, 26.5], "text": "There are several kittens playing"},
        ],
    )

    # passing `use_tantivy=False` to use lance FTS index
    # `use_tantivy=True` by default
    table.create_fts_index("text", use_tantivy=False)
    table.search("puppy").limit(10).select(["text"]).to_list()
    # [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
    # ...
    # --8<-- [end:basic_fts]
    # --8<-- [start:fts_config_stem]
    table.create_fts_index("text", tokenizer_name="en_stem", replace=True)
    # --8<-- [end:fts_config_stem]
    # --8<-- [start:fts_config_folding]
    table.create_fts_index(
        "text",
        use_tantivy=False,
        language="French",
        stem=True,
        ascii_folding=True,
        replace=True,
    )
    # --8<-- [end:fts_config_folding]
    # --8<-- [start:fts_prefiltering]
    table.search("puppy").limit(10).where("text='foo'", prefilter=True).to_list()
    # --8<-- [end:fts_prefiltering]
    # --8<-- [start:fts_postfiltering]
    table.search("puppy").limit(10).where("text='foo'", prefilter=False).to_list()
    # --8<-- [end:fts_postfiltering]
    # --8<-- [start:fts_with_position]
    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
    # --8<-- [end:fts_with_position]
    # --8<-- [start:fts_incremental_index]
    table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
    table.optimize()
    # --8<-- [end:fts_incremental_index]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fts_native_async():
    """Async variants of the native FTS docs snippets."""
    # --8<-- [start:basic_fts_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)

    async_tbl = await async_db.create_table(
        "my_table_fts_async",
        data=[
            {"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"},
            {"vector": [5.9, 26.5], "text": "There are several kittens playing"},
        ],
    )

    # async API uses our native FTS algorithm
    await async_tbl.create_index("text", config=FTS())
    await (
        async_tbl.query().nearest_to_text("puppy").select(["text"]).limit(10).to_list()
    )
    # [{'text': 'Frodo was a happy puppy', '_score': 0.6931471824645996}]
    # ...
    # --8<-- [end:basic_fts_async]
    # --8<-- [start:fts_config_stem_async]
    await async_tbl.create_index(
        "text", config=FTS(language="English", stem=True, remove_stop_words=True)
    )
    # --8<-- [end:fts_config_stem_async]
    # --8<-- [start:fts_config_folding_async]
    await async_tbl.create_index(
        "text", config=FTS(language="French", stem=True, ascii_folding=True)
    )
    # --8<-- [end:fts_config_folding_async]
    # --8<-- [start:fts_prefiltering_async]
    await (
        async_tbl.query()
        .nearest_to_text("puppy")
        .limit(10)
        .where("text='foo'")
        .to_list()
    )
    # --8<-- [end:fts_prefiltering_async]
    # --8<-- [start:fts_postfiltering_async]
    await (
        async_tbl.query()
        .nearest_to_text("puppy")
        .limit(10)
        .where("text='foo'")
        .postfilter()
        .to_list()
    )
    # --8<-- [end:fts_postfiltering_async]
    # --8<-- [start:fts_with_position_async]
    await async_tbl.create_index("text", config=FTS(with_position=True))
    # --8<-- [end:fts_with_position_async]
    # --8<-- [start:fts_incremental_index_async]
    await async_tbl.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
    await async_tbl.optimize()
    # --8<-- [end:fts_incremental_index_async]
|
||||
|
||||
|
||||
@pytest.mark.skip()
def test_hybrid_search():
    """Docs snippets: hybrid (vector + FTS) search with OpenAI embeddings.

    Skipped in CI because it requires an OpenAI API key.
    """
    # --8<-- [start:import-openai]
    import openai

    # --8<-- [end:import-openai]
    # --8<-- [start:openai-embeddings]
    # Ingest embedding function in LanceDB table
    # Configuring the environment variable OPENAI_API_KEY
    if "OPENAI_API_KEY" not in os.environ:
        # OR set the key here as a variable
        openai.api_key = "sk-..."
    embeddings = get_registry().get("openai").create()

    # --8<-- [end:openai-embeddings]
    # --8<-- [start:class-Documents]
    class Documents(LanceModel):
        vector: Vector(embeddings.ndims()) = embeddings.VectorField()
        text: str = embeddings.SourceField()

    # --8<-- [end:class-Documents]
    # --8<-- [start:basic_hybrid_search]
    data = [
        {"text": "rebel spaceships striking from a hidden base"},
        {"text": "have won their first victory against the evil Galactic Empire"},
        {"text": "during the battle rebel spies managed to steal secret plans"},
        {"text": "to the Empire's ultimate weapon the Death Star"},
    ]
    uri = "data/sample-lancedb"
    db = lancedb.connect(uri)
    # Table name must not collide with the "documents" table that
    # test_vector_search creates against the same database URI.
    table = db.create_table("documents_hybrid", schema=Documents)
    # ingest docs with auto-vectorization
    table.add(data)
    # Create a fts index before the hybrid search
    table.create_fts_index("text")
    # hybrid search with default re-ranker
    table.search("flower moon", query_type="hybrid").to_pandas()
    # --8<-- [end:basic_hybrid_search]
    # --8<-- [start:hybrid_search_pass_vector_text]
    vector_query = [0.1, 0.2, 0.3, 0.4, 0.5]
    text_query = "flower moon"
    (
        table.search(query_type="hybrid")
        .vector(vector_query)
        .text(text_query)
        .limit(5)
        .to_pandas()
    )
    # --8<-- [end:hybrid_search_pass_vector_text]
|
||||
|
||||
|
||||
# NOTE(review): the "openai-embeddings" and "class-Documents" snippet sections
# below duplicate the section names used in test_hybrid_search above — confirm
# which occurrence the docs reference before renaming either.
@pytest.mark.skip
@pytest.mark.asyncio
async def test_hybrid_search_async():
    """Async variants of the hybrid-search docs snippets.

    Skipped in CI because it requires an OpenAI API key. The asyncio marker is
    required so the coroutine is actually awaited once the skip is lifted.
    """
    import openai

    # --8<-- [start:openai-embeddings]
    # Ingest embedding function in LanceDB table
    # Configuring the environment variable OPENAI_API_KEY
    if "OPENAI_API_KEY" not in os.environ:
        # OR set the key here as a variable
        openai.api_key = "sk-..."
    embeddings = get_registry().get("openai").create()

    # --8<-- [end:openai-embeddings]
    # --8<-- [start:class-Documents]
    class Documents(LanceModel):
        vector: Vector(embeddings.ndims()) = embeddings.VectorField()
        text: str = embeddings.SourceField()

    # --8<-- [end:class-Documents]
    # --8<-- [start:basic_hybrid_search_async]
    uri = "data/sample-lancedb"
    async_db = await lancedb.connect_async(uri)
    data = [
        {"text": "rebel spaceships striking from a hidden base"},
        {"text": "have won their first victory against the evil Galactic Empire"},
        {"text": "during the battle rebel spies managed to steal secret plans"},
        {"text": "to the Empire's ultimate weapon the Death Star"},
    ]
    async_tbl = await async_db.create_table("documents_hybrid_async", schema=Documents)
    # ingest docs with auto-vectorization
    await async_tbl.add(data)
    # Create a fts index before the hybrid search
    await async_tbl.create_index("text", config=FTS())
    text_query = "flower moon"
    vector_query = embeddings.compute_query_embeddings(text_query)[0]
    # hybrid search with default re-ranker
    await (
        async_tbl.query()
        .nearest_to(vector_query)
        .nearest_to_text(text_query)
        .to_pandas()
    )
    # --8<-- [end:basic_hybrid_search_async]
    # --8<-- [start:hybrid_search_pass_vector_text_async]
    vector_query = [0.1, 0.2, 0.3, 0.4, 0.5]
    text_query = "flower moon"
    await (
        async_tbl.query()
        .nearest_to(vector_query)
        .nearest_to_text(text_query)
        .limit(5)
        .to_pandas()
    )
    # --8<-- [end:hybrid_search_pass_vector_text_async]
|
||||
# ---- end of diff view ----