Files
lancedb/python/python/tests/docs/test_binary_vector.py
Will Jones ecdee4d2b1 feat(python): add search() method to async API (#2049)
Reviving #1966.

Closes #1938

The `search()` method can apply embeddings for the user. This simplifies
hybrid search, so instead of writing:

```python
vector_query = embeddings.compute_query_embeddings("flower moon")[0]
await (
    async_tbl.query()
    .nearest_to(vector_query)
    .nearest_to_text("flower moon")
    .to_pandas()
)
```

You can write:

```python
await (await async_tbl.search("flower moon", query_type="hybrid")).to_pandas()
```

Unfortunately, we had to do a double-await here because `search()` needs
to be async. This is because it often needs to do IO to retrieve and run
an embedding function.
2025-02-24 14:19:25 -08:00

81 lines
2.4 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import shutil
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
import pytest
# --8<-- [end:imports]
shutil.rmtree("data/binary_lancedb", ignore_errors=True)
def test_binary_vector():
# --8<-- [start:sync_binary_vector]
db = lancedb.connect("data/binary_lancedb")
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
tbl.search(packed_query).distance_type("hamming").to_arrow()
# --8<-- [end:sync_binary_vector]
db.drop_table("my_binary_vectors")
@pytest.mark.asyncio
async def test_binary_vector_async():
# --8<-- [start:async_binary_vector]
db = await lancedb.connect_async("data/binary_lancedb")
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = await db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
await tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
await (await tbl.search(packed_query)).distance_type("hamming").to_arrow()
# --8<-- [end:async_binary_vector]
await db.drop_table("my_binary_vectors")