mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-09 15:20:41 +00:00
Previously, users could only specify new data types in `alterColumns` as
strings:
```ts
await tbl.alterColumns([
path: "price",
dataType: "float"
]);
```
But this has some problems:
1. It wasn't clear what were valid types
2. It was impossible to specify nested types, like lists and vector
columns.
This PR changes it to take an Arrow data type, similar to how the Python
API works. This allows casting vector types:
```ts
await tbl.alterColumns([
{
path: "vector",
dataType: new arrow.FixedSizeList(
2,
new arrow.Field("item", new arrow.Float16(), false),
),
},
]);
```
Closes #2185
209 lines
6.5 KiB
Python
209 lines
6.5 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
|
|
# --8<-- [start:imports]
|
|
import lancedb
|
|
import pandas as pd
|
|
import pyarrow as pa
|
|
# --8<-- [end:imports]
|
|
|
|
import pytest
|
|
from numpy.random import randint, random
|
|
|
|
|
|
def test_quickstart(tmp_path):
|
|
# --8<-- [start:set_uri]
|
|
uri = "data/sample-lancedb"
|
|
# --8<-- [end:set_uri]
|
|
uri = tmp_path
|
|
# --8<-- [start:connect]
|
|
db = lancedb.connect(uri)
|
|
# --8<-- [end:connect]
|
|
|
|
# --8<-- [start:create_table]
|
|
data = [
|
|
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
|
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
|
]
|
|
|
|
tbl = db.create_table("my_table", data=data)
|
|
# --8<-- [end:create_table]
|
|
|
|
# --8<-- [start:create_table_pandas]
|
|
df = pd.DataFrame(
|
|
[
|
|
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
|
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
|
]
|
|
)
|
|
tbl = db.create_table("table_from_df", data=df)
|
|
# --8<-- [end:create_table_pandas]
|
|
|
|
# --8<-- [start:create_empty_table]
|
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
|
|
tbl = db.create_table("empty_table", schema=schema)
|
|
# --8<-- [end:create_empty_table]
|
|
# --8<-- [start:open_table]
|
|
tbl = db.open_table("my_table")
|
|
# --8<-- [end:open_table]
|
|
# --8<-- [start:table_names]
|
|
print(db.table_names())
|
|
# --8<-- [end:table_names]
|
|
# --8<-- [start:add_data]
|
|
# Option 1: Add a list of dicts to a table
|
|
data = [
|
|
{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
|
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0},
|
|
]
|
|
tbl.add(data)
|
|
|
|
# Option 2: Add a pandas DataFrame to a table
|
|
df = pd.DataFrame(data)
|
|
tbl.add(data)
|
|
# --8<-- [end:add_data]
|
|
# --8<-- [start:vector_search]
|
|
tbl.search([100, 100]).limit(2).to_pandas()
|
|
# --8<-- [end:vector_search]
|
|
tbl.add(
|
|
[
|
|
{"vector": random(2), "item": "autogen", "price": randint(100)}
|
|
for _ in range(1000)
|
|
]
|
|
)
|
|
# --8<-- [start:add_columns]
|
|
tbl.add_columns({"double_price": "cast((price * 2) as float)"})
|
|
# --8<-- [end:add_columns]
|
|
# --8<-- [start:alter_columns]
|
|
tbl.alter_columns(
|
|
{
|
|
"path": "double_price",
|
|
"rename": "dbl_price",
|
|
"data_type": pa.float64(),
|
|
"nullable": True,
|
|
}
|
|
)
|
|
# --8<-- [end:alter_columns]
|
|
# --8<-- [start:alter_columns_vector]
|
|
tbl.alter_columns(
|
|
{
|
|
"path": "vector",
|
|
"data_type": pa.list_(pa.float16(), list_size=2),
|
|
}
|
|
)
|
|
# --8<-- [end:alter_columns_vector]
|
|
# Change it back since we can get a panic with fp16
|
|
tbl.alter_columns(
|
|
{
|
|
"path": "vector",
|
|
"data_type": pa.list_(pa.float32(), list_size=2),
|
|
}
|
|
)
|
|
# --8<-- [start:drop_columns]
|
|
tbl.drop_columns(["dbl_price"])
|
|
# --8<-- [end:drop_columns]
|
|
# --8<-- [start:create_index]
|
|
tbl.create_index(num_sub_vectors=1)
|
|
# --8<-- [end:create_index]
|
|
# --8<-- [start:delete_rows]
|
|
tbl.delete('item = "fizz"')
|
|
# --8<-- [end:delete_rows]
|
|
# --8<-- [start:drop_table]
|
|
db.drop_table("my_table")
|
|
# --8<-- [end:drop_table]
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_quickstart_async(tmp_path):
|
|
uri = tmp_path
|
|
# --8<-- [start:connect_async]
|
|
db = await lancedb.connect_async(uri)
|
|
# --8<-- [end:connect_async]
|
|
# --8<-- [start:create_table_async]
|
|
data = [
|
|
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
|
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
|
]
|
|
|
|
tbl = await db.create_table("my_table_async", data=data)
|
|
# --8<-- [end:create_table_async]
|
|
# --8<-- [start:create_table_async_pandas]
|
|
df = pd.DataFrame(
|
|
[
|
|
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
|
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
|
]
|
|
)
|
|
|
|
tbl = await db.create_table("table_from_df_async", df)
|
|
# --8<-- [end:create_table_async_pandas]
|
|
# --8<-- [start:create_empty_table_async]
|
|
schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), list_size=2))])
|
|
tbl = await db.create_table("empty_table_async", schema=schema)
|
|
# --8<-- [end:create_empty_table_async]
|
|
# --8<-- [start:open_table_async]
|
|
tbl = await db.open_table("my_table_async")
|
|
# --8<-- [end:open_table_async]
|
|
# --8<-- [start:table_names_async]
|
|
print(await db.table_names())
|
|
# --8<-- [end:table_names_async]
|
|
# --8<-- [start:add_data_async]
|
|
# Option 1: Add a list of dicts to a table
|
|
data = [
|
|
{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
|
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0},
|
|
]
|
|
await tbl.add(data)
|
|
|
|
# Option 2: Add a pandas DataFrame to a table
|
|
df = pd.DataFrame(data)
|
|
await tbl.add(data)
|
|
# --8<-- [end:add_data_async]
|
|
# Add sufficient data for training
|
|
data = [{"vector": [x, x], "item": "filler", "price": x * x} for x in range(1000)]
|
|
await tbl.add(data)
|
|
# --8<-- [start:vector_search_async]
|
|
await tbl.vector_search([100, 100]).limit(2).to_pandas()
|
|
# --8<-- [end:vector_search_async]
|
|
# --8<-- [start:add_columns_async]
|
|
await tbl.add_columns({"double_price": "cast((price * 2) as float)"})
|
|
# --8<-- [end:add_columns_async]
|
|
# --8<-- [start:alter_columns_async]
|
|
await tbl.alter_columns(
|
|
{
|
|
"path": "double_price",
|
|
"rename": "dbl_price",
|
|
"data_type": pa.float64(),
|
|
"nullable": True,
|
|
}
|
|
)
|
|
# --8<-- [end:alter_columns_async]
|
|
# --8<-- [start:alter_columns_async_vector]
|
|
await tbl.alter_columns(
|
|
{
|
|
"path": "vector",
|
|
"data_type": pa.list_(pa.float16(), list_size=2),
|
|
}
|
|
)
|
|
# --8<-- [end:alter_columns_async_vector]
|
|
# Change it back since we can get a panic with fp16
|
|
await tbl.alter_columns(
|
|
{
|
|
"path": "vector",
|
|
"data_type": pa.list_(pa.float32(), list_size=2),
|
|
}
|
|
)
|
|
# --8<-- [start:drop_columns_async]
|
|
await tbl.drop_columns(["dbl_price"])
|
|
# --8<-- [end:drop_columns_async]
|
|
await tbl.vector_search([100, 100]).limit(2).to_pandas()
|
|
# --8<-- [end:vector_search_async]
|
|
# --8<-- [start:create_index_async]
|
|
await tbl.create_index("vector")
|
|
# --8<-- [end:create_index_async]
|
|
# --8<-- [start:delete_rows_async]
|
|
await tbl.delete('item = "fizz"')
|
|
# --8<-- [end:delete_rows_async]
|
|
# --8<-- [start:drop_table_async]
|
|
await db.drop_table("my_table_async")
|
|
# --8<-- [end:drop_table_async]
|