Compare commits

..

1 Commits

Author SHA1 Message Date
Daniel Rammer
c6f9607820 feat(remote): implement set/unset_lsm_write_spec REST variant
Wire RemoteTable::set_lsm_write_spec and unset_lsm_write_spec to the
sophon REST endpoints (lancedb/sophon#6181) instead of returning
NotSupported. set_lsm_write_spec maps the LsmWriteSpec onto sophon's
request DTO (mode-tagged sharding, maintained_indexes,
writer_config_defaults) and POSTs to /set_lsm_write_spec; unset_lsm_write_spec
POSTs to /unset_lsm_write_spec.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 22:07:53 -05:00
31 changed files with 394 additions and 1113 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.30.1-beta.0"
current_version = "0.30.0-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

346
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.2.0-beta.3", default-features = false, "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.2.0-beta.3", "tag" = "v7.2.0-beta.3", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=7.2.0-beta.1", default-features = false, "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.2.0-beta.1", default-features = false, "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.2.0-beta.1", default-features = false, "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.2.0-beta.1", "tag" = "v7.2.0-beta.1", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.30.1-beta.0</version>
<version>0.30.0-beta.1</version>
</dependency>
```

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.30.1-beta.0</version>
<version>0.30.0-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.30.1-beta.0</version>
<version>0.30.0-beta.1</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.30.1-beta.0"
version = "0.30.0-beta.1"
publish = false
license.workspace = true
description.workspace = true

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.30.1-beta.0",
"version": "0.30.0-beta.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.33.1-beta.0"
current_version = "0.33.0-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.33.1-beta.0"
version = "0.33.0-beta.1"
publish = false
edition.workspace = true
description = "Python bindings for LanceDB"

View File

@@ -315,15 +315,6 @@ def deserialize_conn(
manifest_enabled=parsed.get("manifest_enabled", False),
namespace_client_properties=parsed.get("namespace_client_properties"),
)
elif connection_type == "remote":
return RemoteDBConnection(
parsed["db_url"],
parsed["api_key"],
parsed.get("region", "us-east-1"),
host_override=parsed.get("host_override"),
client_config=parsed.get("client_config"),
storage_options=storage_options,
)
else:
raise ValueError(f"Unknown connection_type: {connection_type}")

View File

@@ -217,7 +217,6 @@ class Table:
async def uri(self) -> str: ...
async def initial_storage_options(self) -> Optional[Dict[str, str]]: ...
async def latest_storage_options(self) -> Optional[Dict[str, str]]: ...
async def _table_reopen_state(self) -> Dict[str, Any]: ...
async def set_unenforced_primary_key(self, columns: List[str]) -> None: ...
async def set_lsm_write_spec(self, spec: LsmWriteSpec) -> None: ...
async def unset_lsm_write_spec(self) -> None: ...

View File

@@ -3,13 +3,12 @@
import copy
import json
import os
from deprecation import deprecated
import pyarrow as pa
from ._lancedb import async_permutation_builder, PermutationReader
from .table import LanceTable, Table
from .table import LanceTable
from .background_loop import LOOP
from .util import batch_to_tensor, batch_to_tensor_rows
from typing import Any, Callable, Iterator, Literal, Optional, TYPE_CHECKING, Union
@@ -355,41 +354,6 @@ class Transforms:
DEFAULT_BATCH_SIZE = 100
def _table_to_pickle_state(table: Table) -> dict[str, Any]:
from .remote.table import RemoteTable
if isinstance(table, LanceTable) and table._conn.uri.startswith("memory://"):
return {
"kind": "memory",
"name": table.name,
"data": table.to_arrow(),
}
if isinstance(table, (LanceTable, RemoteTable)):
return {
"kind": "table",
"table": table,
}
raise ValueError(f"Cannot pickle table of type {type(table)!r}")
def _table_from_pickle_state(state: dict[str, Any]) -> Table:
from . import connect
kind = state["kind"]
if kind == "table":
return state["table"]
if kind == "remote":
return state["table"]
if kind == "memory":
return connect("memory://").create_table(state["name"], state["data"])
if kind == "local":
db = connect(state["uri"], storage_options=state["storage_options"])
return db.open_table(state["name"], namespace_path=state["namespace"] or None)
raise ValueError(f"Unknown table pickle state kind: {kind}")
class Permutation:
"""
A Permutation is a view of a dataset that can be used as input to model training
@@ -405,15 +369,15 @@ class Permutation:
def __init__(
self,
base_table: Table,
permutation_table: Optional[Table],
base_table: LanceTable,
permutation_table: Optional[LanceTable],
split: int,
selection: dict[str, str],
batch_size: int,
transform_fn: Callable[pa.RecordBatch, Any],
offset: Optional[int] = None,
limit: Optional[int] = None,
connection_factory: Optional[Callable[[str], Table]] = None,
connection_factory: Optional[Callable[[str], LanceTable]] = None,
_reader: Optional[PermutationReader] = None,
):
"""
@@ -433,7 +397,6 @@ class Permutation:
if _reader is None:
_reader = LOOP.run(self._build_reader())
self.reader: PermutationReader = _reader
self._pid = os.getpid()
async def _build_reader(self) -> PermutationReader:
reader = await PermutationReader.from_tables(
@@ -465,25 +428,29 @@ class Permutation:
return new
def with_connection_factory(
self, connection_factory: Callable[[str], Table]
self, connection_factory: Callable[[str], LanceTable]
) -> "Permutation":
"""
Creates a new permutation that will use ``connection_factory`` to reopen
the base table when this permutation is unpickled in a worker process.
The factory is a callable that takes a single argument, the base table
name, and returns a LanceDB table. It must be picklable; the worker
The factory is a callable that takes a single argument the base table
name and returns a [LanceTable]. It must be picklable; the worker
will pickle it via standard ``pickle`` and call it to recover the base
table. Picklable callables in practice means top-level (module-level)
functions, ``functools.partial`` of such functions, or instances of
picklable classes implementing ``__call__``. Lambdas and closures over
local variables don't pickle with the default protocol.
A factory is optional for normal local and remote LanceDB connections:
if not set, ``__getstate__`` captures the table's own picklable reopen
state. Use a factory when that default state is not enough, for example
when credentials should be loaded from the worker environment instead
of being embedded in the pickle.
Setting a factory is necessary when the URI alone is not enough to
re-open the connection — most importantly for LanceDB Cloud (``db://``)
connections, where ``api_key`` and ``region`` aren't recoverable from
the connection object after construction.
For local file or cloud-storage paths the factory is optional: if not
set, ``__getstate__`` falls back to capturing
``(uri, storage_options, namespace_path)`` and re-opening via
``lancedb.connect(uri, storage_options=...)``.
Examples
--------
@@ -541,7 +508,7 @@ class Permutation:
return new
@classmethod
def identity(cls, table: Table) -> "Permutation":
def identity(cls, table: LanceTable) -> "Permutation":
"""
Creates an identity permutation for the given table.
"""
@@ -550,8 +517,8 @@ class Permutation:
@classmethod
def from_tables(
cls,
base_table: Table,
permutation_table: Optional[Table] = None,
base_table: LanceTable,
permutation_table: Optional[LanceTable] = None,
split: Optional[Union[str, int]] = None,
) -> "Permutation":
"""
@@ -627,10 +594,11 @@ class Permutation:
The base table is captured either via a user-supplied
``connection_factory`` (see [with_connection_factory]) or, as a
fallback, by the table's own picklable reopen state. The permutation
table is captured as a pyarrow Table (which pickles via Arrow IPC
natively). The reader is dropped from the wire format and rebuilt
lazily on first use.
fallback, by introspecting ``(uri, storage_options, namespace_path)``
on the connection. The permutation table — always an in-memory
LanceDB table — is captured as a pyarrow Table (which pickles via
Arrow IPC natively). The reader is dropped from the wire format;
``__setstate__`` rebuilds it from the restored tables.
"""
permutation_data: Optional[pa.Table] = None
if self.permutation_table is not None:
@@ -654,9 +622,39 @@ class Permutation:
# namespace from the existing connection.
return common
# URI-introspection fallback: only viable for native (OSS) connections
# where (uri, storage_options) is enough to reopen. Remote / cloud
# connections don't expose recoverable api_key / region — those users
# must call with_connection_factory().
try:
base_uri = self.base_table._conn.uri
storage_options = self.base_table._conn.storage_options
except AttributeError as e:
raise ValueError(
"Cannot pickle this Permutation: the base table's connection "
"does not expose a uri/storage_options, which usually means it "
"is a remote (LanceDB Cloud) connection. Call "
"Permutation.with_connection_factory(...) first to provide a "
"picklable callable that re-opens the base table from a worker "
"process."
) from e
if base_uri.startswith("memory://"):
# In-memory base tables don't exist in any worker process by
# default, so dump the entire base table into the pickle. This
# can be expensive for large datasets — users with large
# in-memory base tables should either persist them or set a
# connection_factory.
return {
**common,
"base_table_data": self.base_table.to_arrow(),
}
return {
**common,
"base_table_state": _table_to_pickle_state(self.base_table),
"base_table_uri": base_uri,
"base_table_namespace": self.base_table._namespace_path,
"base_table_storage_options": storage_options,
}
def __setstate__(self, state: dict[str, Any]) -> None:
@@ -665,8 +663,6 @@ class Permutation:
connection_factory = state["connection_factory"]
if connection_factory is not None:
base_table = connection_factory(state["base_table_name"])
elif "base_table_state" in state:
base_table = _table_from_pickle_state(state["base_table_state"])
elif "base_table_data" in state:
# In-memory base table inlined into the pickle; rebuild the same
# way we rebuild the in-memory permutation table.
@@ -684,7 +680,7 @@ class Permutation:
namespace_path=state["base_table_namespace"] or None,
)
permutation_table: Optional[Table] = None
permutation_table: Optional[LanceTable] = None
if state["permutation_data"] is not None:
mem_db = connect("memory://")
permutation_table = mem_db.create_table(
@@ -700,28 +696,10 @@ class Permutation:
self.offset = state["offset"]
self.limit = state["limit"]
self.connection_factory = connection_factory
self.reader = None
self._pid = None
def _ensure_open(self) -> None:
pid = os.getpid()
if self.reader is not None and getattr(self, "_pid", None) == pid:
return
# The reader owns Rust-side table handles. Rebuild it after unpickle or
# fork even though the Python table wrappers reopen themselves.
if hasattr(self.base_table, "_ensure_open"):
self.base_table._ensure_open()
if self.permutation_table is not None and hasattr(
self.permutation_table, "_ensure_open"
):
self.permutation_table._ensure_open()
self.reader = LOOP.run(self._build_reader())
self._pid = pid
@property
def schema(self) -> pa.Schema:
self._ensure_open()
async def do_output_schema():
return await self.reader.output_schema(self.selection)
@@ -739,7 +717,6 @@ class Permutation:
"""
The number of rows in the permutation
"""
self._ensure_open()
return self.reader.count_rows()
@property
@@ -898,7 +875,6 @@ class Permutation:
If skip_last_batch is True, the last batch will be skipped if it is not a
multiple of batch_size.
"""
self._ensure_open()
async def get_iter():
return await self.reader.read(self.selection, batch_size=batch_size)
@@ -1000,7 +976,6 @@ class Permutation:
so `with_format` and `with_transform` affect this method in the same way
they affect iteration.
"""
self._ensure_open()
async def do_take_offsets():
return await self.reader.take_offsets(offsets, selection=self.selection)
@@ -1036,11 +1011,9 @@ class Permutation:
"""
Skip the first `skip` rows of the permutation
"""
self._ensure_open()
new = copy.copy(self)
new.offset = skip
new.reader = LOOP.run(new._build_reader())
new._pid = os.getpid()
return new
@deprecated(details="Use with_take instead")
@@ -1059,11 +1032,9 @@ class Permutation:
"""
Limit the permutation to `limit` rows (following any `skip`)
"""
self._ensure_open()
new = copy.copy(self)
new.limit = limit
new.reader = LOOP.run(new._build_reader())
new._pid = os.getpid()
return new
@deprecated(details="Use with_repeat instead")

View File

@@ -3,7 +3,6 @@
from datetime import timedelta
import json
import logging
from concurrent.futures import ThreadPoolExecutor
import sys
@@ -18,7 +17,7 @@ else:
# Remove this import to fix circular dependency
# from lancedb import connect_async
from lancedb.remote import ClientConfig, RetryConfig, TimeoutConfig, TlsConfig
from lancedb.remote import ClientConfig
import pyarrow as pa
from ..common import DATA
@@ -37,64 +36,6 @@ from ..table import Table
from ..util import validate_table_name
def _duration_seconds(value: Optional[timedelta]) -> Optional[float]:
return value.total_seconds() if value is not None else None
def _timeout_config_to_dict(
config: Optional[TimeoutConfig],
) -> Optional[dict[str, Any]]:
if config is None:
return None
return {
"timeout": _duration_seconds(config.timeout),
"connect_timeout": _duration_seconds(config.connect_timeout),
"read_timeout": _duration_seconds(config.read_timeout),
"pool_idle_timeout": _duration_seconds(config.pool_idle_timeout),
}
def _retry_config_to_dict(config: RetryConfig) -> dict[str, Any]:
return {
"retries": config.retries,
"connect_retries": config.connect_retries,
"read_retries": config.read_retries,
"backoff_factor": config.backoff_factor,
"backoff_jitter": config.backoff_jitter,
"statuses": config.statuses,
}
def _tls_config_to_dict(config: Optional[TlsConfig]) -> Optional[dict[str, Any]]:
if config is None:
return None
return {
"cert_file": config.cert_file,
"key_file": config.key_file,
"ssl_ca_cert": config.ssl_ca_cert,
"assert_hostname": config.assert_hostname,
}
def _client_config_to_dict(config: ClientConfig) -> dict[str, Any]:
if config.header_provider is not None:
raise ValueError(
"Cannot serialize a remote connection with a header_provider. "
"Use static api_key/extra_headers or provide a worker-side "
"connection factory instead."
)
return {
"user_agent": config.user_agent,
"retry_config": _retry_config_to_dict(config.retry_config),
"timeout_config": _timeout_config_to_dict(config.timeout_config),
"extra_headers": config.extra_headers,
"id_delimiter": config.id_delimiter,
"tls_config": _tls_config_to_dict(config.tls_config),
"header_provider": None,
"user_id": config.user_id,
}
class RemoteDBConnection(DBConnection):
"""A connection to a remote LanceDB database."""
@@ -148,11 +89,6 @@ class RemoteDBConnection(DBConnection):
parsed = urlparse(db_url)
if parsed.scheme != "db":
raise ValueError(f"Invalid scheme: {parsed.scheme}, only accepts db://")
self.db_url = db_url
self.api_key = api_key
self.region = region
self.host_override = host_override
self.storage_options = storage_options
self.db_name = parsed.netloc
self.client_config = client_config
@@ -175,20 +111,6 @@ class RemoteDBConnection(DBConnection):
def __repr__(self) -> str:
return f"RemoteConnect(name={self.db_name})"
@override
def serialize(self) -> str:
return json.dumps(
{
"connection_type": "remote",
"db_url": self.db_url,
"api_key": self.api_key,
"region": self.region,
"host_override": self.host_override,
"client_config": _client_config_to_dict(self.client_config),
"storage_options": self.storage_options,
}
)
@override
def list_namespaces(
self,
@@ -409,12 +331,7 @@ class RemoteDBConnection(DBConnection):
)
table = LOOP.run(self._conn.open_table(name, namespace_path=namespace_path))
return RemoteTable(
table,
self.db_name,
connection_state=self.serialize,
namespace_path=namespace_path,
)
return RemoteTable(table, self.db_name)
def clone_table(
self,
@@ -463,12 +380,7 @@ class RemoteDBConnection(DBConnection):
is_shallow=is_shallow,
)
)
return RemoteTable(
table,
self.db_name,
connection_state=self.serialize,
namespace_path=target_namespace_path,
)
return RemoteTable(table, self.db_name)
@override
def create_table(
@@ -613,12 +525,7 @@ class RemoteDBConnection(DBConnection):
fill_value=fill_value,
)
)
return RemoteTable(
table,
self.db_name,
connection_state=self.serialize,
namespace_path=namespace_path,
)
return RemoteTable(table, self.db_name)
@override
def drop_table(self, name: str, namespace_path: Optional[List[str]] = None):

View File

@@ -5,7 +5,6 @@ from datetime import timedelta
import deprecation
import logging
from functools import cached_property
import os
from typing import (
Any,
Callable,
@@ -64,103 +63,14 @@ class RemoteTable(Table):
self,
table: AsyncTable,
db_name: str,
*,
connection_state: Optional[Union[str, Callable[[], str]]] = None,
namespace_path: Optional[List[str]] = None,
):
self._table_handle = table
self._name = table.name
self._table = table
self.db_name = db_name
self._connection_state = connection_state
self._namespace_path = list(namespace_path or [])
self._checkout_version: Optional[int] = None
self._table_state: Optional[dict[str, Any]] = None
self._pid = os.getpid()
def _serialized_connection_state(self) -> str:
if self._connection_state is None:
raise RuntimeError(
"Cannot reopen this remote table because it does not carry "
"serialized connection state"
)
if callable(self._connection_state):
self._connection_state = self._connection_state()
return self._connection_state
def _reopen_state(self) -> dict[str, Any]:
if self._table_state is not None:
return self._table_state
self._table_state = {
"name": self._name,
"namespace_path": self._namespace_path,
"storage_options": None,
}
return self._table_state
@property
def _table(self) -> AsyncTable:
self._ensure_open()
assert self._table_handle is not None
return self._table_handle
@_table.setter
def _table(self, table: AsyncTable) -> None:
self._table_handle = table
self._name = table.name
self._table_state = None
self._pid = os.getpid()
def _ensure_open(self) -> None:
pid = os.getpid()
if self._table_handle is not None and self._pid == pid:
return
# Pickle clears the handle; fork inherits a handle created in the
# parent process. In both cases reopen before touching the Rust client.
from lancedb import deserialize_conn
db = deserialize_conn(self._serialized_connection_state(), for_worker=True)
table_state = self._reopen_state()
table = db.open_table(
table_state["name"],
namespace_path=table_state["namespace_path"] or None,
)
if self._checkout_version is not None:
table.checkout(self._checkout_version)
self._table_handle = table._table
self.db_name = table.db_name
self._pid = pid
def __getstate__(self) -> dict:
return {
"connection_state": self._serialized_connection_state(),
"db_name": self.db_name,
"table_state": self._reopen_state(),
"checkout_version": self._checkout_version,
}
def __setstate__(self, state: dict) -> None:
self._table_handle = None
table_state = state.get("table_state")
if table_state is None:
table_state = {
"name": state["name"],
"namespace_path": state["namespace_path"],
"storage_options": None,
}
self._table_state = table_state
self._name = table_state["name"]
self.db_name = state["db_name"]
self._connection_state = state["connection_state"]
self._namespace_path = table_state["namespace_path"]
self._checkout_version = state["checkout_version"]
self._pid = None
@property
def name(self) -> str:
"""The name of the table"""
return self._name
return self._table.name
def __repr__(self) -> str:
return f"RemoteTable({self.db_name}.{self.name})"
@@ -210,19 +120,13 @@ class RemoteTable(Table):
raise NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
def checkout(self, version: Union[int, str]):
result = LOOP.run(self._table.checkout(version))
self._checkout_version = self.version
return result
return LOOP.run(self._table.checkout(version))
def checkout_latest(self):
result = LOOP.run(self._table.checkout_latest())
self._checkout_version = None
return result
return LOOP.run(self._table.checkout_latest())
def restore(self, version: Optional[Union[int, str]] = None):
result = LOOP.run(self._table.restore(version))
self._checkout_version = None
return result
return LOOP.run(self._table.restore(version))
def list_indices(self) -> Iterable[IndexConfig]:
"""List all the indices on the table"""

View File

@@ -5,7 +5,6 @@ from __future__ import annotations
import asyncio
import inspect
import os
import deprecation
import warnings
from abc import ABC, abstractmethod
@@ -758,12 +757,8 @@ class Table(ABC):
"""
raise NotImplementedError
def _ensure_open(self) -> None:
pass
def __len__(self) -> int:
"""The number of rows in this Table"""
self._ensure_open()
return self.count_rows(None)
@property
@@ -1409,7 +1404,6 @@ class Table(ABC):
pa.RecordBatch
A record batch containing the rows at the given offsets.
"""
self._ensure_open()
# We don't know the order of the results at all. So we calculate a permutation
# for ordering the given offsets. Then we load the data with the _rowoffset
# column. Then we sort by _rowoffset and apply the inverse of the permutation
@@ -1968,7 +1962,6 @@ class LanceTable(Table):
self._location = location # Store location for use in _dataset_path
self._namespace_client = namespace_client
self._pushdown_operations = pushdown_operations or set()
self._init_reopen_tracking()
if _async is not None:
self._table = _async
else:
@@ -1984,66 +1977,6 @@ class LanceTable(Table):
)
)
def _init_reopen_tracking(self) -> None:
self._checkout_version: Optional[int] = None
self._table_state: Optional[dict[str, Any]] = None
self._pid = os.getpid()
def _reopen_state(self) -> dict[str, Any]:
state = LOOP.run(self._table._table_reopen_state())
if get_uri_scheme(self._conn.uri) == "memory":
raise ValueError(
"Cannot pickle an in-memory LanceTable. Use a persisted table "
"or provide a worker-side connection factory."
)
return state
def _copy_reopened_table(self, table: "LanceTable") -> None:
self._conn = table._conn
self._namespace_path = table._namespace_path
self._location = table._location
self._namespace_client = table._namespace_client
self._pushdown_operations = table._pushdown_operations
self._table = table._table
self._pid = os.getpid()
def _ensure_open(self) -> None:
pid = os.getpid()
if getattr(self, "_table", None) is not None and self._pid == pid:
return
if self._table_state is None:
self._table_state = self._reopen_state()
table = self._conn.open_table(
self._table_state["name"],
namespace_path=self._table_state["namespace_path"] or None,
storage_options=self._table_state["storage_options"],
)
if self._checkout_version is not None:
table.checkout(self._checkout_version)
self._copy_reopened_table(table)
def __getstate__(self) -> dict[str, Any]:
return {
"connection_state": self._conn.serialize(),
"table_state": self._reopen_state(),
"checkout_version": self._checkout_version,
}
def __setstate__(self, state: dict[str, Any]) -> None:
from . import deserialize_conn
self._conn = deserialize_conn(state["connection_state"], for_worker=True)
self._namespace_path = list(state["table_state"]["namespace_path"] or [])
self._location = None
self._namespace_client = None
self._pushdown_operations = set()
self._checkout_version = state["checkout_version"]
self._table_state = state["table_state"]
self._table = None
self._pid = None
self._ensure_open()
@property
def name(self) -> str:
return self._table.name
@@ -2247,7 +2180,6 @@ class LanceTable(Table):
0 [1.1, 0.9] vector
"""
LOOP.run(self._table.checkout(version))
self._checkout_version = self.version
def checkout_latest(self):
"""Checkout the latest version of the table. This is an in-place operation.
@@ -2256,7 +2188,6 @@ class LanceTable(Table):
version of the table.
"""
LOOP.run(self._table.checkout_latest())
self._checkout_version = None
def restore(self, version: Optional[Union[int, str]] = None):
"""Restore a version of the table. This is an in-place operation.
@@ -2305,7 +2236,6 @@ class LanceTable(Table):
if version is not None:
LOOP.run(self._table.checkout(version))
LOOP.run(self._table.restore())
self._checkout_version = None
def count_rows(self, filter: Optional[str] = None) -> int:
return LOOP.run(self._table.count_rows(filter))
@@ -3364,7 +3294,6 @@ class LanceTable(Table):
self._location = location
self._namespace_client = namespace_client
self._pushdown_operations = pushdown_operations or set()
self._init_reopen_tracking()
if data_storage_version is not None:
warnings.warn(
@@ -4622,10 +4551,6 @@ class AsyncTable:
"""
return await self._inner.latest_storage_options()
async def _table_reopen_state(self) -> dict[str, Any]:
"""Get the Rust-side table state needed to reopen this table."""
return await self._inner._table_reopen_state()
async def add(
self,
data: DATA,

View File

@@ -1,13 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import re
from concurrent.futures import ThreadPoolExecutor
import contextlib
from datetime import timedelta
import http.server
import json
import multiprocessing as mp
import pickle
import re
import sys
import threading
import time
@@ -172,196 +171,6 @@ def test_table_len_sync():
assert len(table) == 1
def test_remote_connection_serializes():
def handler(request):
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b'{"tables": []}')
with mock_lancedb_connection(handler) as db:
serialized = json.loads(db.serialize())
assert isinstance(serialized["client_config"], dict)
restored = lancedb.deserialize_conn(db.serialize())
assert restored.table_names() == []
def test_remote_table_is_picklable():
def handler(request):
request.close_connection = True
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
{
"version": 1,
"schema": {
"fields": [
{"name": "id", "type": {"type": "int64"}, "nullable": False}
]
},
}
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"3")
else:
request.send_response(404)
request.end_headers()
with mock_lancedb_connection(handler) as db:
table = db.open_table("test")
state = table.__getstate__()
assert state["table_state"] == {
"name": "test",
"namespace_path": [],
"storage_options": None,
}
restored = pickle.loads(pickle.dumps(table))
assert restored.count_rows() == 3
def test_remote_table_reopens_when_pid_changes_without_cached_state():
def handler(request):
request.close_connection = True
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
{
"version": 1,
"schema": {
"fields": [
{"name": "id", "type": {"type": "int64"}, "nullable": False}
]
},
}
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"3")
else:
request.send_response(404)
request.end_headers()
with mock_lancedb_connection(handler) as db:
table = db.open_table("test")
table._pid = -1
table._table_state = None
assert table.count_rows() == 3
def test_remote_table_open_does_not_require_picklable_client_config():
from lancedb.remote import HeaderProvider
class LocalHeaderProvider(HeaderProvider):
def get_headers(self):
return {"X-Test-Header": "present"}
def handler(request):
request.close_connection = True
assert request.headers.get("X-Test-Header") == "present"
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b'{"version": 1, "schema": {"fields": []}}')
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"3")
else:
request.send_response(404)
request.end_headers()
with http.server.HTTPServer(
("localhost", 0), make_mock_http_handler(handler)
) as server:
port = server.server_address[1]
handle = threading.Thread(target=server.serve_forever)
handle.start()
try:
db = lancedb.connect(
"db://dev",
api_key="fake",
host_override=f"http://localhost:{port}",
client_config={
"retry_config": {"retries": 0},
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
"header_provider": LocalHeaderProvider(),
},
)
table = db.open_table("test")
assert table.count_rows() == 3
with pytest.raises(ValueError, match="header_provider"):
pickle.dumps(table)
finally:
server.shutdown()
handle.join()
def test_remote_permutation_is_picklable():
from lancedb.permutation import Permutation
rows = list(range(10))
def handler(request):
request.close_connection = True
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(
{
"version": 1,
"schema": {
"fields": [
{"name": "a", "type": {"type": "int64"}, "nullable": False}
]
},
}
)
request.wfile.write(payload.encode())
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(str(len(rows)).encode())
elif request.path == "/v1/table/test/query/":
content_len = int(request.headers.get("Content-Length"))
body = json.loads(request.rfile.read(content_len))
if "filter" in body:
match = re.search(r"_rowoffset in \((.*?)\)", body["filter"])
offsets = [int(offset.strip()) for offset in match.group(1).split(",")]
else:
offsets = rows
table = pa.table({"a": [rows[offset] for offset in offsets]})
request.send_response(200)
request.send_header("Content-Type", "application/vnd.apache.arrow.file")
request.end_headers()
with pa.ipc.new_file(request.wfile, schema=table.schema) as writer:
writer.write_table(table)
else:
request.send_response(404)
request.end_headers()
with mock_lancedb_connection(handler) as db:
permutation = Permutation.identity(db.open_table("test"))
restored = pickle.loads(pickle.dumps(permutation))
assert restored.__getitems__([0, 2, 4]) == [{"a": 0}, {"a": 2}, {"a": 4}]
def test_create_table_exist_ok():
def handler(request):
if request.path == "/v1/table/test/create/?mode=exist_ok":
@@ -1591,10 +1400,6 @@ def _remote_fork_child(port: int, queue) -> None:
queue.put(db.table_names())
def _remote_table_fork_child(table, queue) -> None:
queue.put(table.count_rows())
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
@@ -1657,65 +1462,3 @@ def test_remote_connection_after_fork():
finally:
server.shutdown()
server_thread.join()
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
"fork() is unavailable on Windows and unsafe on macOS "
"(Apple frameworks/TLS are not fork-safe)"
),
)
def test_inherited_remote_table_reopens_after_fork():
def handler(request):
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b'{"version": 1, "schema": {"fields": []}}')
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"7")
else:
request.send_response(404)
request.end_headers()
server = http.server.HTTPServer(("localhost", 0), make_mock_http_handler(handler))
port = server.server_address[1]
server_thread = threading.Thread(target=server.serve_forever)
server_thread.start()
try:
db = lancedb.connect(
"db://dev",
api_key="fake",
host_override=f"http://localhost:{port}",
client_config={
"retry_config": {"retries": 0},
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
},
)
table = db.open_table("test")
assert table.count_rows() == 7
ctx = mp.get_context("fork")
queue = ctx.Queue()
proc = ctx.Process(target=_remote_table_fork_child, args=(table, queue))
proc.start()
proc.join(timeout=15)
if proc.is_alive():
proc.terminate()
proc.join(timeout=5)
if proc.is_alive():
proc.kill()
proc.join()
pytest.fail("Remote table hung after fork")
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
assert not queue.empty(), "child produced no result"
assert queue.get() == 7
finally:
server.shutdown()
server_thread.join()

View File

@@ -3,7 +3,6 @@
import os
import pickle
import sys
import warnings
from datetime import date, datetime, timedelta
@@ -49,36 +48,6 @@ def test_basic(mem_db: DBConnection):
assert table.to_arrow() == expected_data
def test_lance_table_is_picklable(tmp_db: DBConnection):
table = tmp_db.create_table("pickle_table", pa.table({"id": [1, 2, 3]}))
restored = pickle.loads(pickle.dumps(table))
assert restored.name == "pickle_table"
assert restored.count_rows() == 3
assert restored.to_arrow().column("id").to_pylist() == [1, 2, 3]
def test_lance_table_pickle_preserves_checkout(tmp_db: DBConnection):
table = tmp_db.create_table("pickle_checkout", pa.table({"id": [1]}))
table.add(pa.table({"id": [2]}))
table.checkout(1)
restored = pickle.loads(pickle.dumps(table))
assert restored.count_rows() == 1
assert restored.to_arrow().column("id").to_pylist() == [1]
restored.checkout_latest()
assert restored.count_rows() == 2
def test_memory_lance_table_pickle_is_unsupported(mem_db: DBConnection):
table = mem_db.create_table("memory_pickle", pa.table({"id": [1]}))
with pytest.raises(ValueError, match="in-memory LanceTable"):
pickle.dumps(table)
def test_table_to_pandas_default_matches_arrow(tmp_db: DBConnection):
pd = pytest.importorskip("pandas")
data = pa.table({"id": [1, 2], "text": ["one", "two"]})

View File

@@ -1,15 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import contextlib
import functools
import http.server
import json
import multiprocessing as mp
import pickle
import re
import sys
import threading
import lancedb
import pyarrow as pa
@@ -20,107 +15,6 @@ from lancedb.util import tbl_to_tensor
torch = pytest.importorskip("torch")
REMOTE_ROWS = list(range(100))
def _make_mock_http_handler(handler):
class MockLanceDBHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
handler(self)
def do_POST(self):
handler(self)
return MockLanceDBHandler
def _remote_schema_payload():
return {
"version": 1,
"schema": {
"fields": [
{"name": "a", "type": {"type": "int64"}, "nullable": False},
]
},
}
def _offsets_from_filter(filter_sql: str | None) -> list[int]:
if filter_sql is None:
return REMOTE_ROWS
match = re.search(r"_rowoffset in \((.*?)\)", filter_sql)
if match is None:
return REMOTE_ROWS
raw_offsets = match.group(1).strip()
if raw_offsets == "":
return []
return [int(offset.strip()) for offset in raw_offsets.split(",")]
def _remote_dataset_handler(request):
request.close_connection = True
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(json.dumps(_remote_schema_payload()).encode())
elif request.path == "/v1/table/test/count_rows/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(str(len(REMOTE_ROWS)).encode())
elif request.path == "/v1/table/test/query/":
content_len = int(request.headers.get("Content-Length"))
body = json.loads(request.rfile.read(content_len))
offsets = _offsets_from_filter(body.get("filter"))
requested_columns = body.get("columns") or ["a"]
if isinstance(requested_columns, dict):
requested_columns = list(requested_columns)
data = {}
for column in requested_columns:
if column == "a":
data[column] = [REMOTE_ROWS[offset] for offset in offsets]
elif column == "_rowoffset":
data[column] = offsets
elif column == "_rowid":
data[column] = offsets
table = pa.table(data)
request.send_response(200)
request.send_header("Content-Type", "application/vnd.apache.arrow.file")
request.end_headers()
with pa.ipc.new_file(request.wfile, schema=table.schema) as writer:
writer.write_table(table)
else:
request.send_response(404)
request.end_headers()
@contextlib.contextmanager
def _remote_dataset_table():
with http.server.ThreadingHTTPServer(
("localhost", 0), _make_mock_http_handler(_remote_dataset_handler)
) as server:
port = server.server_address[1]
handle = threading.Thread(target=server.serve_forever)
handle.start()
try:
db = lancedb.connect(
"db://dev",
api_key="fake",
host_override=f"http://localhost:{port}",
client_config={
"retry_config": {"retries": 0},
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
},
)
yield db.open_table("test")
finally:
server.shutdown()
handle.join()
def _open_native_table(uri: str, table_name: str):
"""Top-level connection factory used by the explicit-factory pickle test.
@@ -213,39 +107,6 @@ def test_permutation_dataloader_multiprocessing(tmp_db):
assert seen == 1000
def test_remote_table_dataloader_multiprocessing():
with _remote_dataset_table() as table:
dataloader = torch.utils.data.DataLoader(
table,
collate_fn=tbl_to_tensor,
batch_size=10,
num_workers=2,
multiprocessing_context="spawn",
)
seen = 0
for batch in dataloader:
assert batch.size(0) == 1
assert batch.size(1) == 10
seen += batch.size(1)
assert seen == len(REMOTE_ROWS)
def test_remote_permutation_dataloader_multiprocessing():
with _remote_dataset_table() as table:
permutation = Permutation.identity(table)
dataloader = torch.utils.data.DataLoader(
permutation,
batch_size=10,
num_workers=2,
multiprocessing_context="spawn",
)
seen = 0
for batch in dataloader:
assert batch["a"].size(0) == 10
seen += batch["a"].size(0)
assert seen == len(REMOTE_ROWS)
def test_permutation_pickle_with_connection_factory(tmp_path):
"""When the user provides a connection_factory, pickling should round-trip
through that factory rather than introspecting the connection URI. Useful
@@ -310,35 +171,6 @@ def _multiworker_dataloader_target(db_uri: str, result_queue):
result_queue.put(count)
def _remote_multiworker_dataloader_target(port: int, result_queue):
import lancedb
from lancedb.permutation import Permutation
db = lancedb.connect(
"db://dev",
api_key="fake",
host_override=f"http://localhost:{port}",
client_config={
"retry_config": {"retries": 0},
"timeout_config": {"connect_timeout": 2, "read_timeout": 2},
},
)
table = db.open_table("test")
permutation = Permutation.identity(table)
dataloader = torch.utils.data.DataLoader(
permutation,
batch_size=10,
num_workers=2,
multiprocessing_context="fork",
)
count = 0
for batch in dataloader:
assert batch["a"].size(0) == 10
count += 1
result_queue.put(count)
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
@@ -376,46 +208,3 @@ def test_permutation_dataloader_fork_workers(tmp_path):
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
assert not queue.empty(), "child produced no batches"
assert queue.get() == 100
@pytest.mark.skipif(
sys.platform != "linux",
reason=(
"fork() is unavailable on Windows and unsafe on macOS "
"(Apple frameworks/TLS are not fork-safe)"
),
)
def test_remote_permutation_dataloader_fork_workers():
with http.server.ThreadingHTTPServer(
("localhost", 0), _make_mock_http_handler(_remote_dataset_handler)
) as server:
port = server.server_address[1]
handle = threading.Thread(target=server.serve_forever)
handle.start()
try:
ctx = mp.get_context("spawn")
queue = ctx.Queue()
proc = ctx.Process(
target=_remote_multiworker_dataloader_target,
args=(port, queue),
)
proc.start()
proc.join(timeout=30)
if proc.is_alive():
proc.terminate()
proc.join(timeout=5)
if proc.is_alive():
proc.kill()
proc.join()
pytest.fail(
"Remote permutation hung when iterated in a fork-based "
"DataLoader worker"
)
assert proc.exitcode == 0, f"child exited with code {proc.exitcode}"
assert not queue.empty(), "child produced no batches"
assert queue.get() == 10
finally:
server.shutdown()
handle.join()

View File

@@ -755,23 +755,6 @@ impl Table {
})
}
pub fn _table_reopen_state(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
let name = inner.name().to_string();
let namespace_path = inner.namespace().to_vec();
let storage_options = inner.initial_storage_options().await;
Python::attach(|py| {
let dict = PyDict::new(py);
dict.set_item("name", name)?;
dict.set_item("namespace_path", namespace_path)?;
dict.set_item("storage_options", storage_options)?;
Ok(dict.unbind())
})
})
}
pub fn __repr__(&self) -> String {
match &self.inner {
None => format!("ClosedTable({})", self.name),

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.30.1-beta.0"
version = "0.30.0-beta.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -1826,16 +1826,57 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
})
}
async fn set_lsm_write_spec(&self, _spec: crate::table::LsmWriteSpec) -> Result<()> {
Err(Error::NotSupported {
message: "set_lsm_write_spec is not supported on LanceDB cloud.".into(),
})
async fn set_lsm_write_spec(&self, spec: crate::table::LsmWriteSpec) -> Result<()> {
use crate::table::LsmWriteSpec;
self.check_mutable().await?;
// Map the spec onto the server's request DTO. `sharding` is internally
// tagged on `mode` to mirror sophon's `Sharding` enum; `maintained_indexes`
// and `writer_config_defaults` are sent verbatim (an empty list means "no
// maintained indexes", not "default to all").
let sharding = match &spec {
LsmWriteSpec::Bucket {
column,
num_buckets,
..
} => serde_json::json!({
"mode": "bucket",
"column": column,
"num_buckets": num_buckets,
}),
LsmWriteSpec::Identity { column, .. } => serde_json::json!({
"mode": "identity",
"column": column,
}),
LsmWriteSpec::Unsharded { .. } => serde_json::json!({ "mode": "unsharded" }),
};
let body = serde_json::json!({
"sharding": sharding,
"maintained_indexes": spec.maintained_indexes(),
"writer_config_defaults": spec.writer_config_defaults(),
});
let request = self
.client
.post(&format!(
"/v1/table/{}/set_lsm_write_spec/",
self.identifier
))
.json(&body);
let (request_id, response) = self.send(request, true).await?;
self.check_table_response(&request_id, response).await?;
Ok(())
}
async fn unset_lsm_write_spec(&self) -> Result<()> {
Err(Error::NotSupported {
message: "unset_lsm_write_spec is not supported on LanceDB cloud.".into(),
})
self.check_mutable().await?;
let request = self.client.post(&format!(
"/v1/table/{}/unset_lsm_write_spec/",
self.identifier
));
let (request_id, response) = self.send(request, true).await?;
self.check_table_response(&request_id, response).await?;
Ok(())
}
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
@@ -4376,6 +4417,91 @@ mod tests {
assert!(matches!(e, Error::IndexNotFound { .. }));
}
#[tokio::test]
async fn test_set_lsm_write_spec_unsharded() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(
request.url().path(),
"/v1/table/my_table/set_lsm_write_spec/"
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
assert_eq!(body["sharding"], serde_json::json!({ "mode": "unsharded" }));
assert_eq!(body["maintained_indexes"], serde_json::json!(["id_idx"]));
assert_eq!(
body["writer_config_defaults"],
serde_json::json!({ "max_memtable_rows": "1000" })
);
http::Response::builder()
.status(200)
.body(r#"{"maintained_indexes":["id_idx"]}"#)
.unwrap()
});
let spec = crate::table::LsmWriteSpec::unsharded()
.with_maintained_indexes(["id_idx"])
.with_writer_config_defaults([("max_memtable_rows", "1000")]);
table.set_lsm_write_spec(spec).await.unwrap();
}
#[tokio::test]
async fn test_set_lsm_write_spec_bucket() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(
request.url().path(),
"/v1/table/my_table/set_lsm_write_spec/"
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
assert_eq!(
body["sharding"],
serde_json::json!({ "mode": "bucket", "column": "id", "num_buckets": 16 })
);
assert_eq!(body["maintained_indexes"], serde_json::json!([]));
http::Response::builder().status(200).body("{}").unwrap()
});
table
.set_lsm_write_spec(crate::table::LsmWriteSpec::bucket("id", 16))
.await
.unwrap();
}
#[tokio::test]
async fn test_set_lsm_write_spec_identity() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(
request.url().path(),
"/v1/table/my_table/set_lsm_write_spec/"
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
assert_eq!(
body["sharding"],
serde_json::json!({ "mode": "identity", "column": "tenant" })
);
http::Response::builder().status(200).body("{}").unwrap()
});
table
.set_lsm_write_spec(crate::table::LsmWriteSpec::identity("tenant"))
.await
.unwrap();
}
#[tokio::test]
async fn test_unset_lsm_write_spec() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(
request.url().path(),
"/v1/table/my_table/unset_lsm_write_spec/"
);
http::Response::builder().status(200).body("{}").unwrap()
});
table.unset_lsm_write_spec().await.unwrap();
}
#[tokio::test]
async fn test_wait_for_index() {
let table = _make_table_with_indices(0);

View File

@@ -870,8 +870,10 @@ mod tests {
.await
.unwrap();
// Should return empty or nearly empty result
assert!(result[0].num_rows() <= 1);
assert_eq!(
result.iter().map(|batch| batch.num_rows()).sum::<usize>(),
0
);
}
#[tokio::test]