mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 23:12:58 +00:00
feat: support shallow clone (#2653)
Support shallow cloning a dataset at a specific location to create a new dataset, using the shallow_clone feature in Lance. Also introduce remote `clone` API for remote tables for this functionality.
This commit is contained in:
@@ -60,6 +60,15 @@ class Connection(object):
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
) -> Table: ...
|
||||
# Type stub for the native connection's `clone_table` coroutine.
# The parameter list mirrors the pyo3 signature of the Rust binding:
# only `target_table_name` and `source_uri` are required; the remaining
# arguments default to root namespace / no version / no tag / shallow.
# NOTE(review): unlike the Python wrapper classes, this stub has no `*`
# marker, so the optional arguments may also be passed positionally.
async def clone_table(
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
target_namespace: List[str] = [],
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
) -> Table: ...
|
||||
async def rename_table(
|
||||
self,
|
||||
cur_name: str,
|
||||
|
||||
@@ -665,6 +665,60 @@ class LanceDBConnection(DBConnection):
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
|
||||
def clone_table(
    self,
    target_table_name: str,
    source_uri: str,
    *,
    target_namespace: Optional[List[str]] = None,
    source_version: Optional[int] = None,
    source_tag: Optional[str] = None,
    is_shallow: bool = True,
) -> LanceTable:
    """Clone a table from a source table.

    A shallow clone creates a new table that shares the underlying data files
    with the source table but has its own independent manifest. This allows
    both the source and cloned tables to evolve independently while initially
    sharing the same data, deletion, and index files.

    Parameters
    ----------
    target_table_name: str
        The name of the target table to create.
    source_uri: str
        The URI of the source table to clone from.
    target_namespace: List[str], optional
        The namespace for the target table.
        None or empty list represents root namespace.
    source_version: int, optional
        The version of the source table to clone.
    source_tag: str, optional
        The tag of the source table to clone.
    is_shallow: bool, default True
        Whether to perform a shallow clone (True) or deep clone (False).
        Currently only shallow clone is supported.

    Returns
    -------
    A LanceTable object representing the cloned table.
    """
    # Normalise once: ``None`` is documented as "root namespace", and using a
    # fresh list avoids sharing a mutable default between calls.
    namespace = list(target_namespace) if target_namespace else []

    # The async call performs the clone; its return value is not needed here
    # because the synchronous table handle is opened by name just below.
    LOOP.run(
        self._conn.clone_table(
            target_table_name,
            source_uri,
            target_namespace=namespace,
            source_version=source_version,
            source_tag=source_tag,
            is_shallow=is_shallow,
        )
    )
    return LanceTable.open(
        self,
        target_table_name,
        namespace=namespace,
    )
|
||||
|
||||
@override
|
||||
def drop_table(
|
||||
self,
|
||||
@@ -1136,6 +1190,54 @@ class AsyncConnection(object):
|
||||
)
|
||||
return AsyncTable(table)
|
||||
|
||||
async def clone_table(
    self,
    target_table_name: str,
    source_uri: str,
    *,
    target_namespace: Optional[List[str]] = None,
    source_version: Optional[int] = None,
    source_tag: Optional[str] = None,
    is_shallow: bool = True,
) -> AsyncTable:
    """Clone a table from a source table.

    A shallow clone creates a new table that shares the underlying data files
    with the source table but has its own independent manifest. This allows
    both the source and cloned tables to evolve independently while initially
    sharing the same data, deletion, and index files.

    Parameters
    ----------
    target_table_name: str
        The name of the target table to create.
    source_uri: str
        The URI of the source table to clone from.
    target_namespace: List[str], optional
        The namespace for the target table.
        None or empty list represents root namespace.
    source_version: int, optional
        The version of the source table to clone.
    source_tag: str, optional
        The tag of the source table to clone.
    is_shallow: bool, default True
        Whether to perform a shallow clone (True) or deep clone (False).
        Currently only shallow clone is supported.

    Returns
    -------
    An AsyncTable object representing the cloned table.
    """
    # The native binding declares this argument as a list of strings, so the
    # documented ``None`` ("root namespace") must be turned into an empty
    # list before crossing the boundary. Copying also keeps the caller's list
    # from being shared.
    namespace = list(target_namespace) if target_namespace else []

    table = await self._inner.clone_table(
        target_table_name,
        source_uri,
        target_namespace=namespace,
        source_version=source_version,
        source_tag=source_tag,
        is_shallow=is_shallow,
    )
    return AsyncTable(table)
|
||||
|
||||
async def rename_table(
|
||||
self,
|
||||
cur_name: str,
|
||||
|
||||
@@ -212,6 +212,53 @@ class RemoteDBConnection(DBConnection):
|
||||
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
||||
return RemoteTable(table, self.db_name)
|
||||
|
||||
def clone_table(
    self,
    target_table_name: str,
    source_uri: str,
    *,
    target_namespace: Optional[List[str]] = None,
    source_version: Optional[int] = None,
    source_tag: Optional[str] = None,
    is_shallow: bool = True,
) -> Table:
    """Clone a table from a source table.

    Parameters
    ----------
    target_table_name: str
        The name of the target table to create.
    source_uri: str
        The URI of the source table to clone from.
    target_namespace: List[str], optional
        The namespace for the target table.
        None or empty list represents root namespace.
    source_version: int, optional
        The version of the source table to clone.
    source_tag: str, optional
        The tag of the source table to clone.
    is_shallow: bool, default True
        Whether to perform a shallow clone (True) or deep clone (False).
        Currently only shallow clone is supported.

    Returns
    -------
    A RemoteTable object representing the cloned table.
    """
    from .table import RemoteTable

    # ``None`` is documented as the root namespace; pass a fresh empty list
    # instead of a shared mutable default.
    namespace = list(target_namespace) if target_namespace else []

    table = LOOP.run(
        self._conn.clone_table(
            target_table_name,
            source_uri,
            target_namespace=namespace,
            source_version=source_version,
            source_tag=source_tag,
            is_shallow=is_shallow,
        )
    )
    return RemoteTable(table, self.db_name)
|
||||
|
||||
@override
|
||||
def create_table(
|
||||
self,
|
||||
|
||||
@@ -831,3 +831,119 @@ def test_local_table_operations_with_namespace_raise_error(tmp_path):
|
||||
# Test table_names without namespace - should work normally
|
||||
tables_root = list(db.table_names())
|
||||
assert "test_table" in tables_root
|
||||
|
||||
|
||||
def test_clone_table_latest_version(tmp_path):
    """Cloning without a version or tag yields the source's latest rows."""
    import os

    db = lancedb.connect(tmp_path)

    # Seed the source with two rows, then append a third so that a newer
    # version exists for the clone to pick up.
    initial_rows = [
        {"id": 1, "text": "hello", "vector": [1.0, 2.0]},
        {"id": 2, "text": "world", "vector": [3.0, 4.0]},
    ]
    src = db.create_table("source", data=initial_rows)
    src.add([{"id": 3, "text": "test", "vector": [5.0, 6.0]}])

    # No version/tag given: the clone is expected to see all three rows.
    clone = db.clone_table("cloned", os.path.join(tmp_path, "source.lance"))

    assert clone.count_rows() == 3
    assert "cloned" in db.table_names()

    # The cloned contents must match the source ids exactly.
    frame = clone.to_pandas()
    assert len(frame) == 3
    assert set(frame["id"].tolist()) == {1, 2, 3}
|
||||
|
||||
|
||||
def test_clone_table_specific_version(tmp_path):
    """Cloning with ``source_version`` pins the clone to that version's rows."""
    import os

    db = lancedb.connect(tmp_path)

    initial_rows = [
        {"id": 1, "text": "hello", "vector": [1.0, 2.0]},
        {"id": 2, "text": "world", "vector": [3.0, 4.0]},
    ]
    src = db.create_table("source", data=initial_rows)

    # Remember the version before any further writes happen.
    pinned_version = src.version

    # A later append moves the source forward to three rows.
    src.add([{"id": 3, "text": "test", "vector": [5.0, 6.0]}])
    assert src.count_rows() == 3

    # Cloning the remembered version should only expose the first two rows.
    src_path = os.path.join(tmp_path, "source.lance")
    clone = db.clone_table("cloned", src_path, source_version=pinned_version)

    assert clone.count_rows() == 2
    frame = clone.to_pandas()
    assert set(frame["id"].tolist()) == {1, 2}
|
||||
|
||||
|
||||
def test_clone_table_with_tag(tmp_path):
    """Cloning with ``source_tag`` pins the clone to the tagged version's rows."""
    import os

    db = lancedb.connect(tmp_path)

    initial_rows = [
        {"id": 1, "text": "hello", "vector": [1.0, 2.0]},
        {"id": 2, "text": "world", "vector": [3.0, 4.0]},
    ]
    src = db.create_table("source", data=initial_rows)

    # Tag the two-row state before writing anything else.
    src.tags.create("v1.0", src.version)

    # Append after tagging so the tag and the head differ.
    src.add([{"id": 3, "text": "test", "vector": [5.0, 6.0]}])
    assert src.count_rows() == 3

    # Cloning by tag should only expose the rows present when it was created.
    src_path = os.path.join(tmp_path, "source.lance")
    clone = db.clone_table("cloned", src_path, source_tag="v1.0")

    assert clone.count_rows() == 2
    frame = clone.to_pandas()
    assert set(frame["id"].tolist()) == {1, 2}
|
||||
|
||||
|
||||
def test_clone_table_deep_clone_fails(tmp_path):
    """Requesting a deep clone (``is_shallow=False``) is rejected with an error."""
    import os

    db = lancedb.connect(tmp_path)

    # Only the existence of the source matters here, so the handle is dropped.
    db.create_table(
        "source",
        data=[
            {"id": 1, "text": "hello", "vector": [1.0, 2.0]},
            {"id": 2, "text": "world", "vector": [3.0, 4.0]},
        ],
    )

    src_path = os.path.join(tmp_path, "source.lance")
    with pytest.raises(Exception, match="Deep clone is not yet implemented"):
        db.clone_table("cloned", src_path, is_shallow=False)
|
||||
|
||||
@@ -163,6 +163,34 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (target_table_name, source_uri, target_namespace=vec![], source_version=None, source_tag=None, is_shallow=true))]
|
||||
pub fn clone_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
target_table_name: String,
|
||||
source_uri: String,
|
||||
target_namespace: Vec<String>,
|
||||
source_version: Option<u64>,
|
||||
source_tag: Option<String>,
|
||||
is_shallow: bool,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
|
||||
let mut builder = inner.clone_table(target_table_name, source_uri);
|
||||
builder = builder.target_namespace(target_namespace);
|
||||
if let Some(version) = source_version {
|
||||
builder = builder.source_version(version);
|
||||
}
|
||||
if let Some(tag) = source_tag {
|
||||
builder = builder.source_tag(tag);
|
||||
}
|
||||
builder = builder.is_shallow(is_shallow);
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
let table = builder.execute().await.infer_error()?;
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
|
||||
pub fn rename_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
|
||||
Reference in New Issue
Block a user