mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-04 02:42:57 +00:00
Make creating (and adding to) tables via Iterators more flexible & intuitive (#430)
It improves the UX as iterators can be of any type supported by the
table (plus recordbatch) & there is no separate requirement.
Also expands the test cases for pydantic & arrow schema.
If this is looks good I'll update the docs.
Example usage:
```
class Content(LanceModel):
vector: vector(2)
item: str
price: float
def make_batches():
for _ in range(5):
yield from [
# pandas
pd.DataFrame({
"vector": [[3.1, 4.1], [1, 1]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}),
# pylist
[
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
],
# recordbatch
pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
),
# pydantic list
[
Content(vector=[3.1, 4.1], item="foo", price=10.0),
Content(vector=[5.9, 26.5], item="bar", price=20.0),
]]
db = lancedb.connect("db")
tbl = db.create_table("tabley", make_batches(), schema=Content, mode="overwrite")
tbl.add(make_batches())
```
Same should with arrow schema.
---------
Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
@@ -17,7 +17,7 @@ import pyarrow as pa
|
||||
import pytest
|
||||
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel
|
||||
from lancedb.pydantic import LanceModel, vector
|
||||
|
||||
|
||||
def test_basic(tmp_path):
|
||||
@@ -77,35 +77,78 @@ def test_ingest_pd(tmp_path):
|
||||
assert db.open_table("test").name == db["test"].name
|
||||
|
||||
|
||||
def test_ingest_record_batch_iterator(tmp_path):
|
||||
def batch_reader():
|
||||
for i in range(5):
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]]),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
["vector", "item", "price"],
|
||||
)
|
||||
def test_ingest_iterator(tmp_path):
|
||||
class PydanticSchema(LanceModel):
|
||||
vector: vector(2)
|
||||
item: str
|
||||
price: float
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table(
|
||||
"test",
|
||||
batch_reader(),
|
||||
schema=pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32())),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float32()),
|
||||
]
|
||||
),
|
||||
arrow_schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float32()),
|
||||
]
|
||||
)
|
||||
|
||||
tbl_len = len(tbl)
|
||||
tbl.add(batch_reader())
|
||||
assert len(tbl) == tbl_len * 2
|
||||
assert len(tbl.list_versions()) == 2
|
||||
def make_batches():
|
||||
for _ in range(5):
|
||||
yield from [
|
||||
# pandas
|
||||
pd.DataFrame(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [1, 1]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
),
|
||||
# pylist
|
||||
[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
],
|
||||
# recordbatch
|
||||
pa.RecordBatch.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
["vector", "item", "price"],
|
||||
),
|
||||
# pa Table
|
||||
pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
["vector", "item", "price"],
|
||||
),
|
||||
# pydantic list
|
||||
[
|
||||
PydanticSchema(vector=[3.1, 4.1], item="foo", price=10.0),
|
||||
PydanticSchema(vector=[5.9, 26.5], item="bar", price=20.0),
|
||||
]
|
||||
# TODO: test pydict separately. it is unique column number and names contraint
|
||||
]
|
||||
|
||||
def run_tests(schema):
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("table2", make_batches(), schema=schema, mode="overwrite")
|
||||
|
||||
tbl.to_pandas()
|
||||
assert tbl.search([3.1, 4.1]).limit(1).to_df()["_distance"][0] == 0.0
|
||||
assert tbl.search([5.9, 26.5]).limit(1).to_df()["_distance"][0] == 0.0
|
||||
|
||||
tbl_len = len(tbl)
|
||||
tbl.add(make_batches())
|
||||
assert len(tbl) == tbl_len * 2
|
||||
assert len(tbl.list_versions()) == 2
|
||||
db.drop_database()
|
||||
|
||||
run_tests(arrow_schema)
|
||||
run_tests(PydanticSchema)
|
||||
|
||||
|
||||
def test_create_mode(tmp_path):
|
||||
|
||||
Reference in New Issue
Block a user