feat!: upgrade Lance to 0.18.0 (#1657)

BREAKING CHANGE: default file format changed to Lance v2.0.

Upgrade Lance to 0.18.0

Change notes: https://github.com/lancedb/lance/releases/tag/v0.18.0
This commit is contained in:
LuQQiu
2024-09-19 10:50:26 -07:00
committed by GitHub
parent b3c0227065
commit abeaae3d80
12 changed files with 57 additions and 50 deletions

View File

@@ -20,13 +20,13 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.17.0", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.17.0" }
lance-linalg = { "version" = "=0.17.0" }
lance-table = { "version" = "=0.17.0" }
lance-testing = { "version" = "=0.17.0" }
lance-datafusion = { "version" = "=0.17.0" }
lance-encoding = { "version" = "=0.17.0" }
lance = { "version" = "=0.18.0", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.18.0" }
lance-linalg = { "version" = "=0.18.0" }
lance-table = { "version" = "=0.18.0" }
lance-testing = { "version" = "=0.18.0" }
lance-datafusion = { "version" = "=0.18.0" }
lance-encoding = { "version" = "=0.18.0" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

View File

@@ -107,7 +107,7 @@ describe("given a connection", () => {
const data = [...Array(10000).keys()].map((i) => ({ id: i }));
// Create in v1 mode
let table = await db.createTable("test", data);
let table = await db.createTable("test", data, { useLegacyFormat: true });
const isV2 = async (table: Table) => {
const data = await table.query().toArrow({ maxBatchLength: 100000 });
@@ -118,7 +118,7 @@ describe("given a connection", () => {
await expect(isV2(table)).resolves.toBe(false);
// Create in v2 mode
table = await db.createTable("test_v2", data, { useLegacyFormat: false });
table = await db.createTable("test_v2", data);
await expect(isV2(table)).resolves.toBe(true);

View File

@@ -44,11 +44,12 @@ export interface CreateTableOptions {
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
*/
storageOptions?: Record<string, string>;
/**
* The version of the data storage format to use.
*
* The default is `legacy`, which is Lance format v1.
* `stable` is the new format, which is Lance format v2.
* The default is `stable`.
* Set to "legacy" to use the old format.
*/
dataStorageVersion?: string;
@@ -64,9 +65,9 @@ export interface CreateTableOptions {
/**
* If true then data files will be written with the legacy format
*
* The default is true while the new format is in beta
* The default is false.
*
* Deprecated.
* Deprecated. Use `dataStorageVersion` instead.
*/
useLegacyFormat?: boolean;
schema?: SchemaLike;
@@ -266,7 +267,7 @@ export class LocalConnection extends Connection {
throw new Error("data is required");
}
const { buf, mode } = await Table.parseTableData(data, options);
let dataStorageVersion = "legacy";
let dataStorageVersion = "stable";
if (options?.dataStorageVersion !== undefined) {
dataStorageVersion = options.dataStorageVersion;
} else if (options?.useLegacyFormat !== undefined) {
@@ -303,7 +304,7 @@ export class LocalConnection extends Connection {
metadata = registry.getTableMetadata([embeddingFunction]);
}
let dataStorageVersion = "legacy";
let dataStorageVersion = "stable";
if (options?.dataStorageVersion !== undefined) {
dataStorageVersion = options.dataStorageVersion;
} else if (options?.useLegacyFormat !== undefined) {

View File

@@ -130,6 +130,7 @@ impl Connection {
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
let mode = Self::parse_create_mode_str(&mode)?;
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
if let Some(storage_options) = storage_options {
for (key, value) in storage_options {
builder = builder.storage_option(key, value);

View File

@@ -156,7 +156,7 @@ impl Table {
&self,
only_if: Option<String>,
columns: Vec<(String, String)>,
) -> napi::Result<()> {
) -> napi::Result<u64> {
let mut op = self.inner_ref()?.update();
if let Some(only_if) = only_if {
op = op.only_if(only_if);

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.17.0",
"pylance==0.18.0",
"requests>=2.31.0",
"retry>=0.9.2",
"tqdm>=4.27.0",

View File

@@ -610,14 +610,13 @@ class AsyncConnection(object):
connection will be inherited by the table, but can be overridden here.
See available options at
https://lancedb.github.io/lancedb/guides/storage/
data_storage_version: optional, str, default "legacy"
data_storage_version: optional, str, default "stable"
The version of the data storage format to use. Newer versions are more
efficient but require newer versions of lance to read. The default is
"legacy" which will use the legacy v1 version. See the user guide
"stable" which will use the v2 version. See the user guide
for more details.
use_legacy_format: bool, optional, default True. (Deprecated)
use_legacy_format: bool, optional, default False. (Deprecated)
If True, use the legacy format for the table. If False, use the new format.
The default is True while the new format is in beta.
This parameter is deprecated; use `data_storage_version` instead.
enable_v2_manifest_paths: bool, optional, default False
Use the new V2 manifest paths. These paths provide more efficient
@@ -759,9 +758,7 @@ class AsyncConnection(object):
mode = "exist_ok"
if not data_storage_version:
data_storage_version = (
"legacy" if use_legacy_format is None or use_legacy_format else "stable"
)
data_storage_version = "legacy" if use_legacy_format else "stable"
if data is None:
new_table = await self._inner.create_empty_table(

View File

@@ -594,7 +594,9 @@ async def test_create_in_v2_mode(tmp_path):
db = await lancedb.connect_async(tmp_path)
# Create table in v1 mode
tbl = await db.create_table("test", data=make_data(), schema=schema)
tbl = await db.create_table(
"test", data=make_data(), schema=schema, data_storage_version="legacy"
)
async def is_in_v2_mode(tbl):
batches = await tbl.query().to_batches(max_batch_length=1024 * 10)
@@ -626,7 +628,9 @@ async def test_create_in_v2_mode(tmp_path):
assert await is_in_v2_mode(tbl)
# Create empty table uses v1 mode by default
tbl = await db.create_table("test_empty_v2_default", data=None, schema=schema)
tbl = await db.create_table(
"test_empty_v2_default", data=None, schema=schema, data_storage_version="legacy"
)
await tbl.add(make_table())
assert not await is_in_v2_mode(tbl)

View File

@@ -307,7 +307,7 @@ impl<const HAS_DATA: bool, T: IntoArrow> CreateTableBuilder<HAS_DATA, T> {
/// Set the data storage version.
///
/// The default is `LanceFileVersion::Legacy`.
/// The default is `LanceFileVersion::Stable`.
pub fn data_storage_version(mut self, data_storage_version: LanceFileVersion) -> Self {
self.data_storage_version = Some(data_storage_version);
self
@@ -315,13 +315,9 @@ impl<const HAS_DATA: bool, T: IntoArrow> CreateTableBuilder<HAS_DATA, T> {
/// Set to true to use the v1 format for data files
///
/// This is currently defaulted to true and can be set to false to opt-in
/// to the new format. This should only be used for experimentation and
/// evaluation. The new format is still in beta and may change in ways that
/// are not backwards compatible.
///
/// Once the new format is stable, the default will change to `false` for
/// several releases and then eventually this option will be removed.
/// This is set to false by default to enable the stable format.
/// This should only be used for experimentation and
/// evaluation. This option may be removed in future releases.
#[deprecated(since = "0.9.0", note = "use data_storage_version instead")]
pub fn use_legacy_format(mut self, use_legacy_format: bool) -> Self {
self.data_storage_version = if use_legacy_format {
@@ -1240,6 +1236,7 @@ mod tests {
let tbl = db
.create_table("v1_test", make_data())
.data_storage_version(LanceFileVersion::Legacy)
.execute()
.await
.unwrap();

View File

@@ -145,7 +145,7 @@ impl SentenceTransformersEmbeddingsBuilder {
let device = self.device.unwrap_or(Device::Cpu);
let repo = if let Some(revision) = self.revision {
Repo::with_revision(model_id, RepoType::Model, revision.to_string())
Repo::with_revision(model_id, RepoType::Model, revision)
} else {
Repo::new(model_id, RepoType::Model)
};

View File

@@ -229,7 +229,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
message: "plain_query is not yet supported on LanceDB cloud.".into(),
})
}
async fn update(&self, update: UpdateBuilder) -> Result<()> {
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
let request = self.client.post(&format!("/table/{}/update/", self.name));
let mut updates = Vec::new();
@@ -245,9 +245,16 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
let response = self.client.send(request).await?;
self.check_table_response(response).await?;
let response = self.check_table_response(response).await?;
Ok(())
let body = response.text().await?;
serde_json::from_str(&body).map_err(|e| Error::Http {
message: format!(
"Failed to parse updated rows result from response {}: {}",
body, e
),
})
}
async fn delete(&self, predicate: &str) -> Result<()> {
let body = serde_json::json!({ "predicate": predicate });
@@ -395,7 +402,7 @@ mod tests {
Box::pin(table.version().map_ok(|_| ())),
Box::pin(table.schema().map_ok(|_| ())),
Box::pin(table.count_rows(None).map_ok(|_| ())),
Box::pin(table.update().column("a", "a + 1").execute()),
Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
Box::pin(table.merge_insert(&["test"]).execute(example_data())),
Box::pin(table.delete("false")), // TODO: other endpoints.
@@ -619,7 +626,7 @@ mod tests {
assert_eq!(only_if, "b > 10");
}
http::Response::builder().status(200).body("").unwrap()
http::Response::builder().status(200).body("1").unwrap()
});
table

View File

@@ -349,8 +349,9 @@ impl UpdateBuilder {
self
}
/// Executes the update operation
pub async fn execute(self) -> Result<()> {
/// Executes the update operation.
/// Returns the number of rows that were updated.
pub async fn execute(self) -> Result<u64> {
if self.columns.is_empty() {
Err(Error::InvalidInput {
message: "at least one column must be specified in an update operation".to_string(),
@@ -396,7 +397,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
data: Box<dyn arrow_array::RecordBatchReader + Send>,
) -> Result<()>;
async fn delete(&self, predicate: &str) -> Result<()>;
async fn update(&self, update: UpdateBuilder) -> Result<()>;
async fn update(&self, update: UpdateBuilder) -> Result<u64>;
async fn create_index(&self, index: IndexBuilder) -> Result<()>;
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
async fn merge_insert(
@@ -1782,9 +1783,6 @@ impl TableInternal for NativeTable {
let data =
MaybeEmbedded::try_new(data, self.table_definition().await?, add.embedding_registry)?;
// Still use the legacy lance format (v1) by default.
// We don't want to accidentally switch to v2 format during an add operation.
// If the table is already v2 this won't have any effect.
let mut lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
mode: match add.mode {
AddDataMode::Append => WriteMode::Append,
@@ -1846,7 +1844,7 @@ impl TableInternal for NativeTable {
}
}
async fn update(&self, update: UpdateBuilder) -> Result<()> {
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
let dataset = self.dataset.get().await?.clone();
let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));
if let Some(predicate) = update.filter {
@@ -1858,9 +1856,11 @@ impl TableInternal for NativeTable {
}
let operation = builder.build()?;
let ds = operation.execute().await?;
self.dataset.set_latest(ds.as_ref().clone()).await;
Ok(())
let res = operation.execute().await?;
self.dataset
.set_latest(res.new_dataset.as_ref().clone())
.await;
Ok(res.rows_updated)
}
async fn build_plan(