feat: support namespace credentials vending (#2778)

Based on https://github.com/lancedb/lance/pull/4984

1. Bump to 1.0.0-beta.2
2. Use DirectoryNamespace in lance to perform all testing in python and
rust for much better coverage
3. Refactor `ListingDatabase` so that it can accept a location and a
namespace. This is needed because namespace connections rely on the listing
database (the local lancedb connection): the namespace only resolves the
table location and storage options, and we don't want to bind the namespace
all the way down to rust, since users will plug in namespace implementations
from the python side. `ListingDatabase` therefore needs to be able to accept
the location and namespace that are created from the namespace connection.
4. For credentials vending, we also pass the storage options provider all
the way to the rust layer, and the rust layer calls back into the python
function to fetch the next set of storage options. This is exactly what we
did in pylance.
This commit is contained in:
Jack Ye
2025-11-17 00:42:24 -08:00
committed by GitHub
parent c0cc58c156
commit e47f552a86
27 changed files with 1660 additions and 636 deletions

View File

@@ -5,352 +5,39 @@
import tempfile
import shutil
from typing import Dict, Optional
import pytest
import pyarrow as pa
import lancedb
from lance_namespace.namespace import NATIVE_IMPLS, LanceNamespace
from lance_namespace_urllib3_client.models import (
ListTablesRequest,
ListTablesResponse,
DescribeTableRequest,
DescribeTableResponse,
RegisterTableRequest,
RegisterTableResponse,
DeregisterTableRequest,
DeregisterTableResponse,
CreateTableRequest,
CreateTableResponse,
DropTableRequest,
DropTableResponse,
ListNamespacesRequest,
ListNamespacesResponse,
CreateNamespaceRequest,
CreateNamespaceResponse,
DropNamespaceRequest,
DropNamespaceResponse,
)
class TempNamespace(LanceNamespace):
    """A simple dictionary-backed namespace for testing.

    Tables are tracked in a mapping from table key to dataset URI. Root
    tables use their bare name as key; tables inside a (1-level) namespace
    use ``"<namespace>.<table>"``. The mappings live at class level, keyed
    by the configured root, so that separate instances created with the same
    root observe the same tables and namespaces.
    """

    # Class-level storage to persist table registry across instances
    _global_registry: Dict[str, Dict[str, str]] = {}
    # Class-level storage for namespaces (supporting 1-level namespace)
    _global_namespaces: Dict[str, set] = {}

    def __init__(self, **properties):
        """Initialize the test namespace.

        Args:
            **properties: Configuration properties. The ``"root"`` entry, if
                present, selects the root directory for tables.
        """
        self.config = TempNamespaceConfig(properties)

        # Use the root as a key to maintain separate registries per root;
        # setdefault hands back the shared object so every instance with the
        # same root mutates the same registry.
        root = self.config.root
        self.tables = self._global_registry.setdefault(root, {})
        self.namespaces = self._global_namespaces.setdefault(root, set())

    @staticmethod
    def _table_key(table_id) -> str:
        """Translate a 1- or 2-part table ID into its registry key.

        Raises:
            ValueError: If the ID is empty or has more than two parts.
        """
        if not table_id:
            raise ValueError("Invalid table ID")
        if len(table_id) > 2:
            raise ValueError("Only 1-level namespaces are supported")
        return ".".join(table_id)

    def namespace_id(self) -> str:
        """Return a human-readable unique identifier for this namespace instance.

        Returns:
            A unique identifier string based on the root directory
        """
        return f"TempNamespace {{ root: '{self.config.root}' }}"

    def list_tables(self, request: ListTablesRequest) -> ListTablesResponse:
        """List the tables of the root namespace or of one child namespace.

        Raises:
            ValueError: If the request ID has more than one part.
        """
        if not request.id:
            # Root namespace: keys without a namespace prefix.
            tables = [name for name in self.tables if "." not in name]
        elif len(request.id) == 1:
            # Child namespace: strip the "<namespace>." prefix from the keys.
            prefix = f"{request.id[0]}."
            tables = [
                name[len(prefix) :] for name in self.tables if name.startswith(prefix)
            ]
        else:
            raise ValueError("Only 1-level namespaces are supported")

        return ListTablesResponse(tables=tables)

    def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse:
        """Describe a table by returning its location.

        Raises:
            ValueError: If the table ID is empty or has more than two parts.
            RuntimeError: If the table is not registered.
        """
        table_name = self._table_key(request.id)
        if table_name not in self.tables:
            raise RuntimeError(f"Table does not exist: {table_name}")

        return DescribeTableResponse(location=self.tables[table_name])

    def create_table(
        self, request: CreateTableRequest, request_data: bytes
    ) -> CreateTableResponse:
        """Create a table from an Arrow IPC stream and register it.

        Args:
            request: Carries the table ID and the creation mode.
            request_data: Arrow IPC stream holding the table contents.

        Raises:
            ValueError: If the table ID is empty or has more than two parts.
            RuntimeError: If the table already exists and the mode is not
                ``"overwrite"``.
        """
        table_name = self._table_key(request.id)
        if len(request.id) == 2:
            namespace_name, base_table_name = request.id
            # Creating a namespaced table implicitly records the namespace.
            self.namespaces.add(namespace_name)
            table_uri = f"{self.config.root}/{namespace_name}/{base_table_name}.lance"
        else:
            table_uri = f"{self.config.root}/{table_name}.lance"

        replacing = False
        if table_name in self.tables:
            if request.mode != "overwrite":
                raise RuntimeError(f"Table already exists: {table_name}")
            # Forget the old mapping; the dataset itself is replaced below.
            del self.tables[table_name]
            replacing = True

        # Parse the Arrow IPC stream to get the schema and create the actual table
        import pyarrow.ipc as ipc
        import io
        import lance
        import os

        # Create directory if needed for namespaced tables
        os.makedirs(os.path.dirname(table_uri), exist_ok=True)

        table = ipc.open_stream(io.BytesIO(request_data)).read_all()

        # The default "create" write mode refuses to write over an existing
        # dataset, so an overwrite request must replace the stale dataset
        # that is still on disk instead of failing.
        if replacing and os.path.exists(table_uri):
            write_mode = "overwrite"
        else:
            write_mode = "create"
        lance.write_dataset(table, table_uri, mode=write_mode)

        # Store the table mapping
        self.tables[table_name] = table_uri

        return CreateTableResponse(location=table_uri)

    def drop_table(self, request: DropTableRequest) -> DropTableResponse:
        """Drop a table: delete its files and remove it from the registry.

        Raises:
            ValueError: If the table ID is empty or has more than two parts.
            RuntimeError: If the table is not registered.
        """
        table_name = self._table_key(request.id)
        if table_name not in self.tables:
            raise RuntimeError(f"Table does not exist: {table_name}")

        import shutil
        import os

        # Remove the mapping and delete the files it pointed at (best effort).
        table_uri = self.tables.pop(table_name)
        if os.path.exists(table_uri):
            shutil.rmtree(table_uri, ignore_errors=True)

        return DropTableResponse()

    def register_table(self, request: RegisterTableRequest) -> RegisterTableResponse:
        """Register an existing location under a root-level table name.

        Raises:
            ValueError: If the ID is not a single-part ID, or if no location
                is given.
        """
        if not request.id or len(request.id) != 1:
            raise ValueError("Invalid table ID")
        if not request.location:
            raise ValueError("Table location is required")

        self.tables[request.id[0]] = request.location

        return RegisterTableResponse()

    def deregister_table(
        self, request: DeregisterTableRequest
    ) -> DeregisterTableResponse:
        """Remove a root-level table from the registry without deleting files.

        Raises:
            ValueError: If the ID is not a single-part ID.
            RuntimeError: If the table is not registered.
        """
        if not request.id or len(request.id) != 1:
            raise ValueError("Invalid table ID")

        table_name = request.id[0]
        if table_name not in self.tables:
            raise RuntimeError(f"Table does not exist: {table_name}")

        del self.tables[table_name]
        return DeregisterTableResponse()

    def list_namespaces(self, request: ListNamespacesRequest) -> ListNamespacesResponse:
        """List child namespaces.

        Raises:
            ValueError: If the request ID has more than one part.
        """
        if not request.id:
            # List root-level namespaces
            namespaces = list(self.namespaces)
        elif len(request.id) == 1:
            # For 1-level namespace, there are no child namespaces
            namespaces = []
        else:
            raise ValueError("Only 1-level namespaces are supported")

        return ListNamespacesResponse(namespaces=namespaces)

    def create_namespace(
        self, request: CreateNamespaceRequest
    ) -> CreateNamespaceResponse:
        """Create a 1-level namespace together with its directory.

        Raises:
            ValueError: If the ID is empty or has more than one part.
        """
        if not request.id:
            raise ValueError("Invalid namespace ID")
        if len(request.id) != 1:
            raise ValueError("Only 1-level namespaces are supported")

        namespace_name = request.id[0]
        self.namespaces.add(namespace_name)

        # Create directory for the namespace
        import os

        os.makedirs(f"{self.config.root}/{namespace_name}", exist_ok=True)

        return CreateNamespaceResponse()

    def drop_namespace(self, request: DropNamespaceRequest) -> DropNamespaceResponse:
        """Drop an empty 1-level namespace together with its directory.

        Raises:
            ValueError: If the ID is empty or has more than one part.
            RuntimeError: If the namespace does not exist or still contains
                tables.
        """
        if not request.id:
            raise ValueError("Invalid namespace ID")
        if len(request.id) != 1:
            raise ValueError("Only 1-level namespaces are supported")

        namespace_name = request.id[0]
        if namespace_name not in self.namespaces:
            raise RuntimeError(f"Namespace does not exist: {namespace_name}")

        # Refuse to drop while any table key carries this namespace's prefix.
        prefix = f"{namespace_name}."
        if any(name.startswith(prefix) for name in self.tables):
            raise RuntimeError(
                f"Cannot drop namespace '{namespace_name}': contains tables"
            )

        self.namespaces.remove(namespace_name)

        # Remove directory
        import shutil
        import os

        namespace_dir = f"{self.config.root}/{namespace_name}"
        if os.path.exists(namespace_dir):
            shutil.rmtree(namespace_dir, ignore_errors=True)

        return DropNamespaceResponse()
class TempNamespaceConfig:
    """Configuration holder for TempNamespace."""

    ROOT = "root"

    def __init__(self, properties: Optional[Dict[str, str]] = None):
        """Read the configuration values out of a properties mapping.

        Args:
            properties: Dictionary of configuration properties; ``None`` is
                treated like an empty dictionary.
        """
        source = {} if properties is None else properties
        # Fall back to "/tmp" when no root was configured.
        self._root = source.get(self.ROOT, "/tmp")

    @property
    def root(self) -> str:
        """Return the namespace root directory."""
        return self._root
# Map the "temp" implementation name to TempNamespace's dotted import path,
# which is the name the tests pass to lancedb.connect_namespace("temp", ...).
# NOTE(review): presumably NATIVE_IMPLS is the table lance_namespace consults
# to resolve implementation names to classes — confirm against that package.
NATIVE_IMPLS["temp"] = f"{TempNamespace.__module__}.TempNamespace"
class TestNamespaceConnection:
"""Test namespace-based LanceDB connection."""
"""Test namespace-based LanceDB connection using DirectoryNamespace."""
def setup_method(self):
"""Set up test fixtures."""
self.temp_dir = tempfile.mkdtemp()
# Clear the TestNamespace registry for this test
if self.temp_dir in TempNamespace._global_registry:
TempNamespace._global_registry[self.temp_dir].clear()
if self.temp_dir in TempNamespace._global_namespaces:
TempNamespace._global_namespaces[self.temp_dir].clear()
def teardown_method(self):
"""Clean up test fixtures."""
# Clear the TestNamespace registry
if self.temp_dir in TempNamespace._global_registry:
del TempNamespace._global_registry[self.temp_dir]
if self.temp_dir in TempNamespace._global_namespaces:
del TempNamespace._global_namespaces[self.temp_dir]
shutil.rmtree(self.temp_dir, ignore_errors=True)
def test_connect_namespace_test(self):
"""Test connecting to LanceDB through TestNamespace."""
# Connect using TestNamespace
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
"""Test connecting to LanceDB through DirectoryNamespace."""
# Connect using DirectoryNamespace
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Should be a LanceNamespaceDBConnection
assert isinstance(db, lancedb.LanceNamespaceDBConnection)
# Initially no tables
# Initially no tables in root
assert len(list(db.table_names())) == 0
def test_create_table_through_namespace(self):
"""Test creating a table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create a child namespace first
db.create_namespace(["test_ns"])
# Define schema for empty table
schema = pa.schema(
@@ -361,13 +48,15 @@ class TestNamespaceConnection:
]
)
# Create empty table
table = db.create_table("test_table", schema=schema)
# Create empty table in child namespace
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
assert table is not None
assert table.name == "test_table"
assert table.namespace == ["test_ns"]
assert table.id == "test_ns$test_table"
# Table should appear in namespace
table_names = list(db.table_names())
# Table should appear in child namespace
table_names = list(db.table_names(namespace=["test_ns"]))
assert "test_table" in table_names
assert len(table_names) == 1
@@ -378,21 +67,26 @@ class TestNamespaceConnection:
def test_open_table_through_namespace(self):
"""Test opening an existing table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create a table with schema
# Create a child namespace first
db.create_namespace(["test_ns"])
# Create a table with schema in child namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("test_table", schema=schema)
db.create_table("test_table", schema=schema, namespace=["test_ns"])
# Open the table
table = db.open_table("test_table")
table = db.open_table("test_table", namespace=["test_ns"])
assert table is not None
assert table.name == "test_table"
assert table.namespace == ["test_ns"]
assert table.id == "test_ns$test_table"
# Verify empty table with correct schema
result = table.to_pandas()
@@ -401,44 +95,50 @@ class TestNamespaceConnection:
def test_drop_table_through_namespace(self):
"""Test dropping a table through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create tables
# Create a child namespace first
db.create_namespace(["test_ns"])
# Create tables in child namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("table1", schema=schema)
db.create_table("table2", schema=schema)
db.create_table("table1", schema=schema, namespace=["test_ns"])
db.create_table("table2", schema=schema, namespace=["test_ns"])
# Verify both tables exist
table_names = list(db.table_names())
# Verify both tables exist in child namespace
table_names = list(db.table_names(namespace=["test_ns"]))
assert "table1" in table_names
assert "table2" in table_names
assert len(table_names) == 2
# Drop one table
db.drop_table("table1")
db.drop_table("table1", namespace=["test_ns"])
# Verify only table2 remains
table_names = list(db.table_names())
table_names = list(db.table_names(namespace=["test_ns"]))
assert "table1" not in table_names
assert "table2" in table_names
assert len(table_names) == 1
# Test that drop_table works without explicit namespace parameter
db.drop_table("table2")
assert len(list(db.table_names())) == 0
# Drop the second table
db.drop_table("table2", namespace=["test_ns"])
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
# Should not be able to open dropped table
with pytest.raises(RuntimeError):
db.open_table("table1")
db.open_table("table1", namespace=["test_ns"])
def test_create_table_with_schema(self):
"""Test creating a table with explicit schema through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create a child namespace first
db.create_namespace(["test_ns"])
# Define schema
schema = pa.schema(
@@ -449,9 +149,10 @@ class TestNamespaceConnection:
]
)
# Create table with schema
table = db.create_table("test_table", schema=schema)
# Create table with schema in child namespace
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
assert table is not None
assert table.namespace == ["test_ns"]
# Verify schema
table_schema = table.schema
@@ -461,16 +162,19 @@ class TestNamespaceConnection:
def test_rename_table_not_supported(self):
"""Test that rename_table raises NotImplementedError."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create a table
# Create a child namespace first
db.create_namespace(["test_ns"])
# Create a table in child namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.float32(), 2)),
]
)
db.create_table("old_name", schema=schema)
db.create_table("old_name", schema=schema, namespace=["test_ns"])
# Rename should raise NotImplementedError
with pytest.raises(NotImplementedError, match="rename_table is not supported"):
@@ -478,9 +182,12 @@ class TestNamespaceConnection:
def test_drop_all_tables(self):
"""Test dropping all tables through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create multiple tables
# Create a child namespace first
db.create_namespace(["test_ns"])
# Create multiple tables in child namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
@@ -488,27 +195,30 @@ class TestNamespaceConnection:
]
)
for i in range(3):
db.create_table(f"table{i}", schema=schema)
db.create_table(f"table{i}", schema=schema, namespace=["test_ns"])
# Verify tables exist
assert len(list(db.table_names())) == 3
# Verify tables exist in child namespace
assert len(list(db.table_names(namespace=["test_ns"]))) == 3
# Drop all tables
db.drop_all_tables()
# Drop all tables in child namespace
db.drop_all_tables(namespace=["test_ns"])
# Verify all tables are gone
assert len(list(db.table_names())) == 0
# Verify all tables are gone from child namespace
assert len(list(db.table_names(namespace=["test_ns"]))) == 0
# Test that table_names works with keyword-only namespace parameter
db.create_table("test_table", schema=schema)
result = list(db.table_names(namespace=[]))
db.create_table("test_table", schema=schema, namespace=["test_ns"])
result = list(db.table_names(namespace=["test_ns"]))
assert "test_table" in result
def test_table_operations(self):
"""Test various table operations through namespace."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create a table with schema
# Create a child namespace first
db.create_namespace(["test_ns"])
# Create a table with schema in child namespace
schema = pa.schema(
[
pa.field("id", pa.int64()),
@@ -516,7 +226,7 @@ class TestNamespaceConnection:
pa.field("text", pa.string()),
]
)
table = db.create_table("test_table", schema=schema)
table = db.create_table("test_table", schema=schema, namespace=["test_ns"])
# Verify empty table was created
result = table.to_pandas()
@@ -548,7 +258,7 @@ class TestNamespaceConnection:
# Connect with storage options
storage_opts = {"test_option": "test_value"}
db = lancedb.connect_namespace(
"temp", {"root": self.temp_dir}, storage_options=storage_opts
"dir", {"root": self.temp_dir}, storage_options=storage_opts
)
# Storage options should be preserved
@@ -566,7 +276,7 @@ class TestNamespaceConnection:
def test_namespace_operations(self):
"""Test namespace management operations."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Initially no namespaces
assert len(list(db.list_namespaces())) == 0
@@ -617,7 +327,7 @@ class TestNamespaceConnection:
def test_namespace_with_tables_cannot_be_dropped(self):
"""Test that namespaces containing tables cannot be dropped."""
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create namespace and table
db.create_namespace(["test_namespace"])
@@ -630,7 +340,7 @@ class TestNamespaceConnection:
db.create_table("test_table", schema=schema, namespace=["test_namespace"])
# Try to drop namespace with tables - should fail
with pytest.raises(RuntimeError, match="contains tables"):
with pytest.raises(RuntimeError, match="is not empty"):
db.drop_namespace(["test_namespace"])
# Drop table first
@@ -640,7 +350,7 @@ class TestNamespaceConnection:
db.drop_namespace(["test_namespace"])
def test_same_table_name_different_namespaces(self):
db = lancedb.connect_namespace("temp", {"root": self.temp_dir})
db = lancedb.connect_namespace("dir", {"root": self.temp_dir})
# Create two namespaces
db.create_namespace(["namespace_a"])