diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b5aa641a7..00369f565 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -183,7 +183,7 @@ jobs: runs-on: ubuntu-24.04 strategy: matrix: - msrv: ["1.88.0"] # This should match up with rust-version in Cargo.toml + msrv: ["1.91.0"] # This should match up with rust-version in Cargo.toml env: # Need up-to-date compilers for kernels CC: clang-18 diff --git a/Cargo.lock b/Cargo.lock index cf8bea49a..fcf2858cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1389,9 +1389,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bytes-utils" @@ -1783,6 +1783,16 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-skiplist" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -3072,9 +3082,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9e5c0b1c67a38cb92b41535d44623483beb9511592ae23a3bf42ddec758690" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-array", "rand 0.9.2", @@ -4405,9 +4414,8 @@ dependencies = [ [[package]] name = "lance" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b7f07b905df393a5554eba19055c620f9ea25a3e40a013bda4bd8dc4ca66f01" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-arith", @@ -4426,6 +4434,7 @@ dependencies = [ "byteorder", "bytes", "chrono", + "crossbeam-skiplist", "dashmap", "datafusion", "datafusion-expr", @@ -4465,6 +4474,7 @@ dependencies = [ "tantivy", "tokio", "tokio-stream", + "tokio-util", "tracing", "url", "uuid", @@ -4472,9 +4482,8 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "100e076cb81c8f0c24cd2881c706fc53e037c7d6e81eb320e929e265d157effb" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4493,9 +4502,8 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "588318d3d1ba0f97162fab39a323a0a49866bb35b32af42572c6b6a12296fa27" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrayref", "paste", @@ -4504,9 +4512,8 @@ dependencies = [ [[package]] name = "lance-core" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa01d1cf490ccfd3b8eaeee2781415d0419e6be8366040e57e43677abf2644e" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4543,9 +4550,8 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef89a39e3284eef76f79e63f23de8881a0583ad6feb20ed39f47eadd847a2b88" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-array", @@ -4568,6 +4574,7 @@ dependencies = [ "log", "pin-project", "prost", + "prost-build", "snafu", "tokio", "tracing", @@ -4575,9 +4582,8 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2a60eef5c47e65d91e2ffa8e7e1629c52e7190c8b88a371a1a60601dc49371" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-array", @@ -4595,9 +4601,8 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce4a6631308aa681b2671af8f2a845ff781f8d4e755a2a7ccd012379467094" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-arith", "arrow-array", @@ -4634,9 +4639,8 @@ dependencies = [ [[package]] name = "lance-file" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d4d82357cbfaa1a18494226c15b1cb3c8ed0b6c84b91146323c82047ede419" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-arith", "arrow-array", @@ -4668,9 +4672,8 @@ dependencies = [ [[package]] name = "lance-geo" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7183fc870da62826f0f97df8007b634da053eb310157856efe1dc74f446951c" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "datafusion", "geo-traits", @@ -4684,9 +4687,8 @@ dependencies = [ [[package]] name = "lance-index" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20e9c5aa7024a63af9ae89ee8c0f23c8421b7896742e5cd4a271a60f9956cb80" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-arith", @@ -4753,9 +4755,8 @@ dependencies = [ [[package]] name = "lance-io" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d2af0b17fb374a8181bcf1a10bce5703ae3ee4373c1587ce4bba23e15e45c8" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-arith", @@ -4788,6 +4789,7 @@ dependencies = [ "serde", "shellexpand", "snafu", + "tempfile", "tokio", "tracing", "url", @@ -4795,9 +4797,8 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5125aa62696e75a7475807564b4921f252d8815be606b84bc00e6def0f5c24bb" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4813,9 +4814,8 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70545c2676ce954dfd801da5c6a631a70bba967826cd3a8f31b47d1f04bbfed3" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "async-trait", @@ -4827,9 +4827,8 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92519f9f27d62655030aac62ea0db9614b65f086ebe651c1b0a96e351b668022" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-ipc", @@ -4844,6 +4843,7 @@ dependencies = [ "lance-index", "lance-io", "lance-namespace", + "lance-table", "log", "object_store", "rand 0.9.2", @@ -4859,9 +4859,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.4.5" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" +checksum = "3ad4c947349acd6e37e984eba0254588bd894e6128434338b9e6904e56fb4633" dependencies = [ "reqwest", "serde", @@ -4872,9 +4872,8 @@ dependencies = [ [[package]] name = "lance-table" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06ad37bd90045de8ef533df170c6098e6ff6ecb427aade47d7db8e2c86f2678" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow", "arrow-array", @@ -4913,9 +4912,8 @@ dependencies = [ [[package]] name = "lance-testing" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7f13b0f2b6337af015dcb1519645388dca08c970037aa77aff517687c4019f" +version = "3.0.0-beta.5" +source = "git+https://github.com/lance-format/lance.git?tag=v3.0.0-beta.5#c69274bd83da9930157d5e2ceeb101af13a916a3" dependencies = [ "arrow-array", "arrow-schema", @@ -5628,11 +5626,10 @@ dependencies = [ [[package]] name = "num-bigint-dig" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" dependencies = [ - "byteorder", "lazy_static", "libm", "num-integer", @@ -7274,9 +7271,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.12" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 17460402e..3c7f2c4ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,23 +12,23 @@ repository = "https://github.com/lancedb/lancedb" description = "Serverless, low-latency vector database for AI applications" keywords = ["lancedb", "lance", "database", "vector", "search"] categories = ["database-implementations"] -rust-version = "1.88.0" +rust-version = "1.91.0" [workspace.dependencies] -lance = { "version" = "=2.0.1", default-features = false } -lance-core = "=2.0.1" -lance-datagen = "=2.0.1" -lance-file = "=2.0.1" -lance-io = { "version" = "=2.0.1", default-features = false } -lance-index = "=2.0.1" -lance-linalg = "=2.0.1" -lance-namespace = "=2.0.1" -lance-namespace-impls = { "version" = "=2.0.1", default-features = false } -lance-table = "=2.0.1" -lance-testing = "=2.0.1" -lance-datafusion = "=2.0.1" -lance-encoding = "=2.0.1" -lance-arrow = "=2.0.1" +lance = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-core = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-datagen = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-file = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-io = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-index = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-linalg = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-namespace = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-namespace-impls = { "version" = "=3.0.0-beta.5", default-features = false, "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-table = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-testing = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-datafusion = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-encoding = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } +lance-arrow = { "version" = "=3.0.0-beta.5", "tag" = "v3.0.0-beta.5", "git" = "https://github.com/lance-format/lance.git" } ahash = "0.8" # Note that this one does not include pyarrow arrow = { version = "57.2", optional = false } diff --git a/java/pom.xml b/java/pom.xml index 35a353316..7daa4a498 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -28,7 +28,7 @@ UTF-8 15.0.0 - 2.0.1 + 3.0.0-beta.5 false 2.30.0 1.7 diff --git a/python/Cargo.toml b/python/Cargo.toml index c2f501a5d..14ccf9748 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true repository.workspace = true keywords.workspace = true categories.workspace = true -rust-version = "1.88.0" +rust-version = "1.91.0" [lib] name = "_lancedb" diff --git a/python/python/lancedb/namespace.py b/python/python/lancedb/namespace.py index 6165372c3..eade54c5e 100644 --- a/python/python/lancedb/namespace.py +++ b/python/python/lancedb/namespace.py @@ -44,7 +44,7 @@ from lance_namespace import ( ListNamespacesRequest, CreateNamespaceRequest, DropNamespaceRequest, - CreateEmptyTableRequest, + DeclareTableRequest, ) from lancedb.table import AsyncTable, LanceTable, Table from lancedb.util import validate_table_name @@ -318,20 +318,20 @@ class LanceNamespaceDBConnection(DBConnection): if location is None: # Table doesn't exist or mode is "create", reserve a new location - create_empty_request = CreateEmptyTableRequest( + declare_request = DeclareTableRequest( id=table_id, location=None, properties=self.storage_options if self.storage_options else None, ) - create_empty_response = self._ns.create_empty_table(create_empty_request) + declare_response = self._ns.declare_table(declare_request) - if not create_empty_response.location: + if not declare_response.location: raise ValueError( - "Table location is missing from create_empty_table response" + "Table location is missing from declare_table response" ) - location = create_empty_response.location - namespace_storage_options = create_empty_response.storage_options + location = declare_response.location + namespace_storage_options = declare_response.storage_options # Merge storage options: self.storage_options < user options < namespace options merged_storage_options = dict(self.storage_options) @@ -759,20 +759,20 @@ class AsyncLanceNamespaceDBConnection: if location is None: # Table doesn't exist or mode is "create", reserve a new location - create_empty_request = CreateEmptyTableRequest( + declare_request = DeclareTableRequest( id=table_id, location=None, properties=self.storage_options if self.storage_options else None, ) - create_empty_response = self._ns.create_empty_table(create_empty_request) + declare_response = self._ns.declare_table(declare_request) - if not create_empty_response.location: + if not declare_response.location: raise ValueError( - "Table location is missing from create_empty_table response" + "Table location is missing from declare_table response" ) - location = create_empty_response.location - namespace_storage_options = create_empty_response.storage_options + location = declare_response.location + namespace_storage_options = declare_response.storage_options # Merge storage options: self.storage_options < user options < namespace options merged_storage_options = dict(self.storage_options) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index ff100edcb..d72668b05 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "1.90.0" +channel = "1.91.0" diff --git a/rust/lancedb/src/database.rs b/rust/lancedb/src/database.rs index a4a221193..df027b613 100644 --- a/rust/lancedb/src/database.rs +++ b/rust/lancedb/src/database.rs @@ -85,8 +85,10 @@ pub type TableBuilderCallback = Box OpenTableReq /// Describes what happens when creating a table and a table with /// the same name already exists +#[derive(Default)] pub enum CreateTableMode { /// If the table already exists, an error is returned + #[default] Create, /// If the table already exists, it is opened. Any provided data is /// ignored. The function will be passed an OpenTableBuilder to customize @@ -104,12 +106,6 @@ impl CreateTableMode { } } -impl Default for CreateTableMode { - fn default() -> Self { - Self::Create - } -} - /// A request to create a table pub struct CreateTableRequest { /// The name of the new table diff --git a/rust/lancedb/src/database/namespace.rs b/rust/lancedb/src/database/namespace.rs index 91a55809e..66771c135 100644 --- a/rust/lancedb/src/database/namespace.rs +++ b/rust/lancedb/src/database/namespace.rs @@ -7,6 +7,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; +use lance_io::object_store::{ObjectStoreParams, StorageOptionsAccessor}; use lance_namespace::{ models::{ CreateEmptyTableRequest, CreateNamespaceRequest, CreateNamespaceResponse, @@ -212,45 +213,75 @@ impl Database for LanceNamespaceDatabase { ..Default::default() }; - let location = match self.namespace.declare_table(declare_request).await { - Ok(response) => response.location.ok_or_else(|| Error::Runtime { - message: "Table location is missing from declare_table response".to_string(), - })?, - Err(e) => { - // Check if the error is "not supported" and try create_empty_table as fallback - let err_str = e.to_string().to_lowercase(); - if err_str.contains("not supported") || err_str.contains("not implemented") { - warn!( - "declare_table is not supported by the namespace client, \ + let (location, initial_storage_options) = + match self.namespace.declare_table(declare_request).await { + Ok(response) => { + let loc = response.location.ok_or_else(|| Error::Runtime { + message: "Table location is missing from declare_table response" + .to_string(), + })?; + // Use storage options from response, fall back to self.storage_options + let opts = response + .storage_options + .or_else(|| Some(self.storage_options.clone())) + .filter(|o| !o.is_empty()); + (loc, opts) + } + Err(e) => { + // Check if the error is "not supported" and try create_empty_table as fallback + let err_str = e.to_string().to_lowercase(); + if err_str.contains("not supported") || err_str.contains("not implemented") { + warn!( + "declare_table is not supported by the namespace client, \ falling back to deprecated create_empty_table. \ create_empty_table is deprecated and will be removed in Lance 3.0.0. \ Please upgrade your namespace client to support declare_table." - ); - #[allow(deprecated)] - let create_empty_request = CreateEmptyTableRequest { - id: Some(table_id.clone()), - ..Default::default() - }; + ); + #[allow(deprecated)] + let create_empty_request = CreateEmptyTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }; - #[allow(deprecated)] - let create_response = self - .namespace - .create_empty_table(create_empty_request) - .await - .map_err(|e| Error::Runtime { - message: format!("Failed to create empty table: {}", e), + #[allow(deprecated)] + let create_response = self + .namespace + .create_empty_table(create_empty_request) + .await + .map_err(|e| Error::Runtime { + message: format!("Failed to create empty table: {}", e), + })?; + + let loc = create_response.location.ok_or_else(|| Error::Runtime { + message: "Table location is missing from create_empty_table response" + .to_string(), })?; - - create_response.location.ok_or_else(|| Error::Runtime { - message: "Table location is missing from create_empty_table response" - .to_string(), - })? - } else { - return Err(Error::Runtime { - message: format!("Failed to declare table: {}", e), - }); + // For deprecated path, use self.storage_options + let opts = if self.storage_options.is_empty() { + None + } else { + Some(self.storage_options.clone()) + }; + (loc, opts) + } else { + return Err(Error::Runtime { + message: format!("Failed to declare table: {}", e), + }); + } } - } + }; + + let write_params = if let Some(storage_opts) = initial_storage_options { + let mut params = request.write_options.lance_write_params.unwrap_or_default(); + let store_params = params + .store_params + .get_or_insert_with(ObjectStoreParams::default); + store_params.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_opts), + )); + Some(params) + } else { + request.write_options.lance_write_params }; let native_table = NativeTable::create_from_namespace( @@ -260,7 +291,7 @@ impl Database for LanceNamespaceDatabase { request.namespace.clone(), request.data, None, // write_store_wrapper not used for namespace connections - request.write_options.lance_write_params, + write_params, self.read_consistency_interval, self.server_side_query_enabled, self.session.clone(), diff --git a/rust/lancedb/src/dataloader/permutation/builder.rs b/rust/lancedb/src/dataloader/permutation/builder.rs index c0c418e55..94474a33b 100644 --- a/rust/lancedb/src/dataloader/permutation/builder.rs +++ b/rust/lancedb/src/dataloader/permutation/builder.rs @@ -57,7 +57,7 @@ pub struct PermutationConfig { } /// Strategy for shuffling the data. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub enum ShuffleStrategy { /// The data is randomly shuffled /// @@ -78,15 +78,10 @@ pub enum ShuffleStrategy { /// The data is not shuffled /// /// This is useful for debugging and testing. + #[default] None, } -impl Default for ShuffleStrategy { - fn default() -> Self { - Self::None - } -} - /// Builder for creating a permutation table. /// /// A permutation table is a table that stores split assignments and a shuffled order of rows. This diff --git a/rust/lancedb/src/dataloader/permutation/split.rs b/rust/lancedb/src/dataloader/permutation/split.rs index 12bc8f9b3..8a0d11b1e 100644 --- a/rust/lancedb/src/dataloader/permutation/split.rs +++ b/rust/lancedb/src/dataloader/permutation/split.rs @@ -27,9 +27,10 @@ use crate::{ pub const SPLIT_ID_COLUMN: &str = "split_id"; /// Strategy for assigning rows to splits -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub enum SplitStrategy { /// All rows will have split id 0 + #[default] NoSplit, /// Rows will be randomly assigned to splits /// @@ -73,15 +74,6 @@ pub enum SplitStrategy { Calculated { calculation: String }, } -// The default is not to split the data -// -// All data will be assigned to a single split. -impl Default for SplitStrategy { - fn default() -> Self { - Self::NoSplit - } -} - impl SplitStrategy { pub fn validate(&self, num_rows: u64) -> Result<()> { match self { diff --git a/rust/lancedb/src/io/object_store.rs b/rust/lancedb/src/io/object_store.rs index d935c099a..508c9849d 100644 --- a/rust/lancedb/src/io/object_store.rs +++ b/rust/lancedb/src/io/object_store.rs @@ -195,6 +195,11 @@ mod test { table::WriteOptions, }; + // This test is ignored because lance 3.0 introduced LocalWriter optimization + // that bypasses the object store wrapper for local writes. The mirroring feature + // still works for remote/cloud storage, but can't be tested with local storage. + // See lance commit c878af433 "perf: create local writer for efficient local writes" + #[ignore] #[tokio::test] async fn test_e2e() { let dir1 = tempfile::tempdir().unwrap().keep().canonicalize().unwrap(); @@ -250,32 +255,38 @@ mod test { let primary_location = dir1.join("test.lance").canonicalize().unwrap(); let secondary_location = dir2.join(primary_location.strip_prefix("/").unwrap()); - let mut primary_iter = WalkDir::new(&primary_location).into_iter(); - let mut secondary_iter = WalkDir::new(&secondary_location).into_iter(); + // Skip lance internal directories (_versions, _transactions) and manifest files + let should_skip = |path: &std::path::Path| -> bool { + let path_str = path.to_str().unwrap(); + path_str.contains("_latest.manifest") + || path_str.contains("_versions") + || path_str.contains("_transactions") + }; - let mut primary_elem = primary_iter.next(); - let mut secondary_elem = secondary_iter.next(); + let primary_files: Vec<_> = WalkDir::new(&primary_location) + .into_iter() + .filter_entry(|e| !should_skip(e.path())) + .filter_map(|e| e.ok()) + .map(|e| { + e.path() + .strip_prefix(&primary_location) + .unwrap() + .to_path_buf() + }) + .collect(); - loop { - if primary_elem.is_none() && secondary_elem.is_none() { - break; - } - // primary has more data then secondary, should not run out before secondary - let primary_f = primary_elem.unwrap().unwrap(); - // hit manifest, skip, _versions contains all the manifest and should not exist on secondary - let primary_raw_path = primary_f.file_name().to_str().unwrap(); - if primary_raw_path.contains("_latest.manifest") { - primary_elem = primary_iter.next(); - continue; - } - let secondary_f = secondary_elem.unwrap().unwrap(); - assert_eq!( - primary_f.path().strip_prefix(&primary_location), - secondary_f.path().strip_prefix(&secondary_location) - ); + let secondary_files: Vec<_> = WalkDir::new(&secondary_location) + .into_iter() + .filter_entry(|e| !should_skip(e.path())) + .filter_map(|e| e.ok()) + .map(|e| { + e.path() + .strip_prefix(&secondary_location) + .unwrap() + .to_path_buf() + }) + .collect(); - primary_elem = primary_iter.next(); - secondary_elem = secondary_iter.next(); - } + assert_eq!(primary_files, secondary_files, "File lists should match"); } } diff --git a/rust/lancedb/src/lib.rs b/rust/lancedb/src/lib.rs index 944613253..81f234fb5 100644 --- a/rust/lancedb/src/lib.rs +++ b/rust/lancedb/src/lib.rs @@ -192,13 +192,14 @@ pub use error::{Error, Result}; use lance_linalg::distance::DistanceType as LanceDistanceType; pub use table::Table; -#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize, Default)] #[non_exhaustive] #[serde(rename_all = "lowercase")] pub enum DistanceType { /// Euclidean distance. This is a very common distance metric that /// accounts for both magnitude and direction when determining the distance /// between vectors. l2 distance has a range of [0, ∞). + #[default] L2, /// Cosine distance. Cosine distance is a distance metric /// calculated from the cosine similarity between two vectors. Cosine @@ -220,12 +221,6 @@ pub enum DistanceType { Hamming, } -impl Default for DistanceType { - fn default() -> Self { - Self::L2 - } -} - impl From for LanceDistanceType { fn from(value: DistanceType) -> Self { match value { diff --git a/rust/lancedb/src/table/optimize.rs b/rust/lancedb/src/table/optimize.rs index abe660b38..e8946b85f 100644 --- a/rust/lancedb/src/table/optimize.rs +++ b/rust/lancedb/src/table/optimize.rs @@ -26,8 +26,10 @@ use crate::error::Result; /// optimize different parts of the table on disk. /// /// By default, it optimizes everything, as [`OptimizeAction::All`]. +#[derive(Default)] pub enum OptimizeAction { /// Run all optimizations with default values + #[default] All, /// Compacts files in the dataset /// @@ -84,12 +86,6 @@ pub enum OptimizeAction { Index(OptimizeOptions), } -impl Default for OptimizeAction { - fn default() -> Self { - Self::All - } -} - /// Statistics about the optimization. #[derive(Debug, Default)] pub struct OptimizeStats {