[python] Support schema evolution in local LanceDB (#452)

Previously if you needed to add a column to a table you'd have to
rewrite the whole table. Instead,
we use the merge functionality from Lance format
to incrementally add columns from another table
or dataframe.

---------

Co-authored-by: Chang She <chang@lancedb.com>
Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
Chang She
2023-08-24 14:40:49 -07:00
committed by GitHub
parent 2f1f9f6338
commit e587a17a64
2 changed files with 86 additions and 1 deletions

View File

@@ -16,6 +16,7 @@ from pathlib import Path
from typing import List
from unittest.mock import PropertyMock, patch
import lance
import numpy as np
import pandas as pd
import pyarrow as pa
@@ -295,3 +296,23 @@ def test_restore(db):
with pytest.raises(ValueError):
table.restore(0)
def test_merge(db, tmp_path):
table = LanceTable.create(
db,
"my_table",
data=[{"vector": [1.1, 0.9], "id": 0}, {"vector": [1.2, 1.9], "id": 1}],
)
other_table = pa.table({"document": ["foo", "bar"], "id": [0, 1]})
table.merge(other_table, left_on="id")
assert len(table.list_versions()) == 2
expected = pa.table(
{"vector": [[1.1, 0.9], [1.2, 1.9]], "id": [0, 1], "document": ["foo", "bar"]},
schema=table.schema,
)
assert table.to_arrow() == expected
other_dataset = lance.write_dataset(other_table, tmp_path / "other_table.lance")
table.restore(1)
table.merge(other_dataset, left_on="id")