From abeaae3d807cf59a1fcb22a50fbc75feb0d5a610 Mon Sep 17 00:00:00 2001 From: LuQQiu Date: Thu, 19 Sep 2024 10:50:26 -0700 Subject: [PATCH] feat!: upgrade Lance to 0.18.0 (#1657) BREAKING CHANGE: default file format changed to Lance v2.0. Upgrade Lance to 0.18.0 Change notes: https://github.com/lancedb/lance/releases/tag/v0.18.0 --- Cargo.toml | 14 ++++++------- nodejs/__test__/connection.test.ts | 4 ++-- nodejs/lancedb/connection.ts | 13 ++++++------ nodejs/src/connection.rs | 1 + nodejs/src/table.rs | 2 +- python/pyproject.toml | 2 +- python/python/lancedb/db.py | 11 ++++------ python/python/tests/test_db.py | 8 ++++++-- rust/lancedb/src/connection.rs | 13 +++++------- .../src/embeddings/sentence_transformers.rs | 2 +- rust/lancedb/src/remote/table.rs | 17 +++++++++++----- rust/lancedb/src/table.rs | 20 +++++++++---------- 12 files changed, 57 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d9ec9a3d..3c563e7a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,13 +20,13 @@ keywords = ["lancedb", "lance", "database", "vector", "search"] categories = ["database-implementations"] [workspace.dependencies] -lance = { "version" = "=0.17.0", "features" = ["dynamodb"] } -lance-index = { "version" = "=0.17.0" } -lance-linalg = { "version" = "=0.17.0" } -lance-table = { "version" = "=0.17.0" } -lance-testing = { "version" = "=0.17.0" } -lance-datafusion = { "version" = "=0.17.0" } -lance-encoding = { "version" = "=0.17.0" } +lance = { "version" = "=0.18.0", "features" = ["dynamodb"] } +lance-index = { "version" = "=0.18.0" } +lance-linalg = { "version" = "=0.18.0" } +lance-table = { "version" = "=0.18.0" } +lance-testing = { "version" = "=0.18.0" } +lance-datafusion = { "version" = "=0.18.0" } +lance-encoding = { "version" = "=0.18.0" } # Note that this one does not include pyarrow arrow = { version = "52.2", optional = false } arrow-array = "52.2" diff --git a/nodejs/__test__/connection.test.ts b/nodejs/__test__/connection.test.ts index 
3541e94f..070121f7 100644 --- a/nodejs/__test__/connection.test.ts +++ b/nodejs/__test__/connection.test.ts @@ -107,7 +107,7 @@ describe("given a connection", () => { const data = [...Array(10000).keys()].map((i) => ({ id: i })); // Create in v1 mode - let table = await db.createTable("test", data); + let table = await db.createTable("test", data, { useLegacyFormat: true }); const isV2 = async (table: Table) => { const data = await table.query().toArrow({ maxBatchLength: 100000 }); @@ -118,7 +118,7 @@ describe("given a connection", () => { await expect(isV2(table)).resolves.toBe(false); // Create in v2 mode - table = await db.createTable("test_v2", data, { useLegacyFormat: false }); + table = await db.createTable("test_v2", data); await expect(isV2(table)).resolves.toBe(true); diff --git a/nodejs/lancedb/connection.ts b/nodejs/lancedb/connection.ts index af177348..f352ef3a 100644 --- a/nodejs/lancedb/connection.ts +++ b/nodejs/lancedb/connection.ts @@ -44,11 +44,12 @@ export interface CreateTableOptions { * The available options are described at https://lancedb.github.io/lancedb/guides/storage/ */ storageOptions?: Record; + /** * The version of the data storage format to use. * - * The default is `legacy`, which is Lance format v1. - * `stable` is the new format, which is Lance format v2. + * The default is `stable`. + * Set to "legacy" to use the old format. */ dataStorageVersion?: string; @@ -64,9 +65,9 @@ export interface CreateTableOptions { /** * If true then data files will be written with the legacy format * - * The default is true while the new format is in beta + * The default is false. * - * Deprecated. + * Deprecated. Use data storage version instead. 
*/ useLegacyFormat?: boolean; schema?: SchemaLike; @@ -266,7 +267,7 @@ export class LocalConnection extends Connection { throw new Error("data is required"); } const { buf, mode } = await Table.parseTableData(data, options); - let dataStorageVersion = "legacy"; + let dataStorageVersion = "stable"; if (options?.dataStorageVersion !== undefined) { dataStorageVersion = options.dataStorageVersion; } else if (options?.useLegacyFormat !== undefined) { @@ -303,7 +304,7 @@ export class LocalConnection extends Connection { metadata = registry.getTableMetadata([embeddingFunction]); } - let dataStorageVersion = "legacy"; + let dataStorageVersion = "stable"; if (options?.dataStorageVersion !== undefined) { dataStorageVersion = options.dataStorageVersion; } else if (options?.useLegacyFormat !== undefined) { diff --git a/nodejs/src/connection.rs b/nodejs/src/connection.rs index 6508e939..4a454bfa 100644 --- a/nodejs/src/connection.rs +++ b/nodejs/src/connection.rs @@ -130,6 +130,7 @@ impl Connection { .map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?; let mode = Self::parse_create_mode_str(&mode)?; let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode); + if let Some(storage_options) = storage_options { for (key, value) in storage_options { builder = builder.storage_option(key, value); diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index 338677e4..7cf91357 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -156,7 +156,7 @@ impl Table { &self, only_if: Option, columns: Vec<(String, String)>, - ) -> napi::Result<()> { + ) -> napi::Result { let mut op = self.inner_ref()?.update(); if let Some(only_if) = only_if { op = op.only_if(only_if); diff --git a/python/pyproject.toml b/python/pyproject.toml index 31b3af2e..f2e82d2d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ name = "lancedb" # version in Cargo.toml dependencies = [ "deprecation", - "pylance==0.17.0", + 
"pylance==0.18.0", "requests>=2.31.0", "retry>=0.9.2", "tqdm>=4.27.0", diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py index 8195d09f..4a72b4b0 100644 --- a/python/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -610,14 +610,13 @@ class AsyncConnection(object): connection will be inherited by the table, but can be overridden here. See available options at https://lancedb.github.io/lancedb/guides/storage/ - data_storage_version: optional, str, default "legacy" + data_storage_version: optional, str, default "stable" The version of the data storage format to use. Newer versions are more efficient but require newer versions of lance to read. The default is - "legacy" which will use the legacy v1 version. See the user guide + "stable" which will use the stable v2 version. See the user guide for more details. - use_legacy_format: bool, optional, default True. (Deprecated) + use_legacy_format: bool, optional, default False. (Deprecated) If True, use the legacy format for the table. If False, use the new format. - The default is True while the new format is in beta. This method is deprecated, use `data_storage_version` instead. enable_v2_manifest_paths: bool, optional, default False Use the new V2 manifest paths. 
These paths provide more efficient @@ -759,9 +758,7 @@ class AsyncConnection(object): mode = "exist_ok" if not data_storage_version: - data_storage_version = ( - "legacy" if use_legacy_format is None or use_legacy_format else "stable" - ) + data_storage_version = "legacy" if use_legacy_format else "stable" if data is None: new_table = await self._inner.create_empty_table( diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py index a7f3d33b..8bd7d3af 100644 --- a/python/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -594,7 +594,9 @@ async def test_create_in_v2_mode(tmp_path): db = await lancedb.connect_async(tmp_path) # Create table in v1 mode - tbl = await db.create_table("test", data=make_data(), schema=schema) + tbl = await db.create_table( + "test", data=make_data(), schema=schema, data_storage_version="legacy" + ) async def is_in_v2_mode(tbl): batches = await tbl.query().to_batches(max_batch_length=1024 * 10) @@ -626,7 +628,9 @@ async def test_create_in_v2_mode(tmp_path): assert await is_in_v2_mode(tbl) # Create empty table uses v1 mode by default - tbl = await db.create_table("test_empty_v2_default", data=None, schema=schema) + tbl = await db.create_table( + "test_empty_v2_default", data=None, schema=schema, data_storage_version="legacy" + ) await tbl.add(make_table()) assert not await is_in_v2_mode(tbl) diff --git a/rust/lancedb/src/connection.rs b/rust/lancedb/src/connection.rs index e660c1e5..8d073764 100644 --- a/rust/lancedb/src/connection.rs +++ b/rust/lancedb/src/connection.rs @@ -307,7 +307,7 @@ impl CreateTableBuilder { /// Set the data storage version. /// - /// The default is `LanceFileVersion::Legacy`. + /// The default is `LanceFileVersion::Stable`. 
pub fn data_storage_version(mut self, data_storage_version: LanceFileVersion) -> Self { self.data_storage_version = Some(data_storage_version); self @@ -315,13 +315,9 @@ impl CreateTableBuilder { /// Set to true to use the v1 format for data files /// - /// This is currently defaulted to true and can be set to false to opt-in - /// to the new format. This should only be used for experimentation and - /// evaluation. The new format is still in beta and may change in ways that - /// are not backwards compatible. - /// - /// Once the new format is stable, the default will change to `false` for - /// several releases and then eventually this option will be removed. + /// This is set to false by default to enable the stable format. + /// This should only be used for experimentation and + /// evaluation. This option may be removed in a future release. #[deprecated(since = "0.9.0", note = "use data_storage_version instead")] pub fn use_legacy_format(mut self, use_legacy_format: bool) -> Self { self.data_storage_version = if use_legacy_format { @@ -1240,6 +1236,7 @@ mod tests { let tbl = db .create_table("v1_test", make_data()) + .data_storage_version(LanceFileVersion::Legacy) .execute() .await .unwrap(); diff --git a/rust/lancedb/src/embeddings/sentence_transformers.rs b/rust/lancedb/src/embeddings/sentence_transformers.rs index 7a21130d..0d0c30f7 100644 --- a/rust/lancedb/src/embeddings/sentence_transformers.rs +++ b/rust/lancedb/src/embeddings/sentence_transformers.rs @@ -145,7 +145,7 @@ impl SentenceTransformersEmbeddingsBuilder { let device = self.device.unwrap_or(Device::Cpu); let repo = if let Some(revision) = self.revision { - Repo::with_revision(model_id, RepoType::Model, revision.to_string()) + Repo::with_revision(model_id, RepoType::Model, revision) } else { Repo::new(model_id, RepoType::Model) }; diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index c7a13b49..2a42ce67 100644 --- a/rust/lancedb/src/remote/table.rs +++ 
b/rust/lancedb/src/remote/table.rs @@ -229,7 +229,7 @@ impl TableInternal for RemoteTable { message: "plain_query is not yet supported on LanceDB cloud.".into(), }) } - async fn update(&self, update: UpdateBuilder) -> Result<()> { + async fn update(&self, update: UpdateBuilder) -> Result { let request = self.client.post(&format!("/table/{}/update/", self.name)); let mut updates = Vec::new(); @@ -245,9 +245,16 @@ impl TableInternal for RemoteTable { let response = self.client.send(request).await?; - self.check_table_response(response).await?; + let response = self.check_table_response(response).await?; - Ok(()) + let body = response.text().await?; + + serde_json::from_str(&body).map_err(|e| Error::Http { + message: format!( + "Failed to parse updated rows result from response {}: {}", + body, e + ), + }) } async fn delete(&self, predicate: &str) -> Result<()> { let body = serde_json::json!({ "predicate": predicate }); @@ -395,7 +402,7 @@ mod tests { Box::pin(table.version().map_ok(|_| ())), Box::pin(table.schema().map_ok(|_| ())), Box::pin(table.count_rows(None).map_ok(|_| ())), - Box::pin(table.update().column("a", "a + 1").execute()), + Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())), Box::pin(table.add(example_data()).execute().map_ok(|_| ())), Box::pin(table.merge_insert(&["test"]).execute(example_data())), Box::pin(table.delete("false")), // TODO: other endpoints. @@ -619,7 +626,7 @@ mod tests { assert_eq!(only_if, "b > 10"); } - http::Response::builder().status(200).body("").unwrap() + http::Response::builder().status(200).body("1").unwrap() }); table diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 0044d0fc..8096a8da 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -349,8 +349,9 @@ impl UpdateBuilder { self } - /// Executes the update operation - pub async fn execute(self) -> Result<()> { + /// Executes the update operation. + /// Returns the number of rows that were updated. 
+ pub async fn execute(self) -> Result { if self.columns.is_empty() { Err(Error::InvalidInput { message: "at least one column must be specified in an update operation".to_string(), @@ -396,7 +397,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn data: Box, ) -> Result<()>; async fn delete(&self, predicate: &str) -> Result<()>; - async fn update(&self, update: UpdateBuilder) -> Result<()>; + async fn update(&self, update: UpdateBuilder) -> Result; async fn create_index(&self, index: IndexBuilder) -> Result<()>; async fn list_indices(&self) -> Result>; async fn merge_insert( @@ -1782,9 +1783,6 @@ impl TableInternal for NativeTable { let data = MaybeEmbedded::try_new(data, self.table_definition().await?, add.embedding_registry)?; - // Still use the legacy lance format (v1) by default. - // We don't want to accidentally switch to v2 format during an add operation. - // If the table is already v2 this won't have any effect. let mut lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams { mode: match add.mode { AddDataMode::Append => WriteMode::Append, @@ -1846,7 +1844,7 @@ impl TableInternal for NativeTable { } } - async fn update(&self, update: UpdateBuilder) -> Result<()> { + async fn update(&self, update: UpdateBuilder) -> Result { let dataset = self.dataset.get().await?.clone(); let mut builder = LanceUpdateBuilder::new(Arc::new(dataset)); if let Some(predicate) = update.filter { @@ -1858,9 +1856,11 @@ impl TableInternal for NativeTable { } let operation = builder.build()?; - let ds = operation.execute().await?; - self.dataset.set_latest(ds.as_ref().clone()).await; - Ok(()) + let res = operation.execute().await?; + self.dataset + .set_latest(res.new_dataset.as_ref().clone()) + .await; + Ok(res.rows_updated) } async fn build_plan(