mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-22 21:09:58 +00:00
feat!: upgrade Lance to 0.18.0 (#1657)
BREAKING CHANGE: default file format changed to Lance v2.0. Upgrade Lance to 0.18.0. Change notes: https://github.com/lancedb/lance/releases/tag/v0.18.0
This commit is contained in:
14
Cargo.toml
14
Cargo.toml
@@ -20,13 +20,13 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
|
||||
categories = ["database-implementations"]
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.17.0", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.17.0" }
|
||||
lance-linalg = { "version" = "=0.17.0" }
|
||||
lance-table = { "version" = "=0.17.0" }
|
||||
lance-testing = { "version" = "=0.17.0" }
|
||||
lance-datafusion = { "version" = "=0.17.0" }
|
||||
lance-encoding = { "version" = "=0.17.0" }
|
||||
lance = { "version" = "=0.18.0", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.18.0" }
|
||||
lance-linalg = { "version" = "=0.18.0" }
|
||||
lance-table = { "version" = "=0.18.0" }
|
||||
lance-testing = { "version" = "=0.18.0" }
|
||||
lance-datafusion = { "version" = "=0.18.0" }
|
||||
lance-encoding = { "version" = "=0.18.0" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "52.2", optional = false }
|
||||
arrow-array = "52.2"
|
||||
|
||||
@@ -107,7 +107,7 @@ describe("given a connection", () => {
|
||||
const data = [...Array(10000).keys()].map((i) => ({ id: i }));
|
||||
|
||||
// Create in v1 mode
|
||||
let table = await db.createTable("test", data);
|
||||
let table = await db.createTable("test", data, { useLegacyFormat: true });
|
||||
|
||||
const isV2 = async (table: Table) => {
|
||||
const data = await table.query().toArrow({ maxBatchLength: 100000 });
|
||||
@@ -118,7 +118,7 @@ describe("given a connection", () => {
|
||||
await expect(isV2(table)).resolves.toBe(false);
|
||||
|
||||
// Create in v2 mode
|
||||
table = await db.createTable("test_v2", data, { useLegacyFormat: false });
|
||||
table = await db.createTable("test_v2", data);
|
||||
|
||||
await expect(isV2(table)).resolves.toBe(true);
|
||||
|
||||
|
||||
@@ -44,11 +44,12 @@ export interface CreateTableOptions {
|
||||
* The available options are described at https://lancedb.github.io/lancedb/guides/storage/
|
||||
*/
|
||||
storageOptions?: Record<string, string>;
|
||||
|
||||
/**
|
||||
* The version of the data storage format to use.
|
||||
*
|
||||
* The default is `legacy`, which is Lance format v1.
|
||||
* `stable` is the new format, which is Lance format v2.
|
||||
* The default is `stable`.
|
||||
* Set to "legacy" to use the old format.
|
||||
*/
|
||||
dataStorageVersion?: string;
|
||||
|
||||
@@ -64,9 +65,9 @@ export interface CreateTableOptions {
|
||||
/**
|
||||
* If true then data files will be written with the legacy format
|
||||
*
|
||||
* The default is true while the new format is in beta
|
||||
* The default is false.
|
||||
*
|
||||
* Deprecated.
|
||||
* Deprecated. Use `dataStorageVersion` instead.
|
||||
*/
|
||||
useLegacyFormat?: boolean;
|
||||
schema?: SchemaLike;
|
||||
@@ -266,7 +267,7 @@ export class LocalConnection extends Connection {
|
||||
throw new Error("data is required");
|
||||
}
|
||||
const { buf, mode } = await Table.parseTableData(data, options);
|
||||
let dataStorageVersion = "legacy";
|
||||
let dataStorageVersion = "stable";
|
||||
if (options?.dataStorageVersion !== undefined) {
|
||||
dataStorageVersion = options.dataStorageVersion;
|
||||
} else if (options?.useLegacyFormat !== undefined) {
|
||||
@@ -303,7 +304,7 @@ export class LocalConnection extends Connection {
|
||||
metadata = registry.getTableMetadata([embeddingFunction]);
|
||||
}
|
||||
|
||||
let dataStorageVersion = "legacy";
|
||||
let dataStorageVersion = "stable";
|
||||
if (options?.dataStorageVersion !== undefined) {
|
||||
dataStorageVersion = options.dataStorageVersion;
|
||||
} else if (options?.useLegacyFormat !== undefined) {
|
||||
|
||||
@@ -130,6 +130,7 @@ impl Connection {
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
|
||||
let mode = Self::parse_create_mode_str(&mode)?;
|
||||
let mut builder = self.get_inner()?.create_table(&name, batches).mode(mode);
|
||||
|
||||
if let Some(storage_options) = storage_options {
|
||||
for (key, value) in storage_options {
|
||||
builder = builder.storage_option(key, value);
|
||||
|
||||
@@ -156,7 +156,7 @@ impl Table {
|
||||
&self,
|
||||
only_if: Option<String>,
|
||||
columns: Vec<(String, String)>,
|
||||
) -> napi::Result<()> {
|
||||
) -> napi::Result<u64> {
|
||||
let mut op = self.inner_ref()?.update();
|
||||
if let Some(only_if) = only_if {
|
||||
op = op.only_if(only_if);
|
||||
|
||||
@@ -3,7 +3,7 @@ name = "lancedb"
|
||||
# version in Cargo.toml
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.17.0",
|
||||
"pylance==0.18.0",
|
||||
"requests>=2.31.0",
|
||||
"retry>=0.9.2",
|
||||
"tqdm>=4.27.0",
|
||||
|
||||
@@ -610,14 +610,13 @@ class AsyncConnection(object):
|
||||
connection will be inherited by the table, but can be overridden here.
|
||||
See available options at
|
||||
https://lancedb.github.io/lancedb/guides/storage/
|
||||
data_storage_version: optional, str, default "legacy"
|
||||
data_storage_version: optional, str, default "stable"
|
||||
The version of the data storage format to use. Newer versions are more
|
||||
efficient but require newer versions of lance to read. The default is
|
||||
"legacy" which will use the legacy v1 version. See the user guide
|
||||
"stable" which will use the stable v2 version. See the user guide
|
||||
for more details.
|
||||
use_legacy_format: bool, optional, default True. (Deprecated)
|
||||
use_legacy_format: bool, optional, default False. (Deprecated)
|
||||
If True, use the legacy format for the table. If False, use the new format.
|
||||
The default is True while the new format is in beta.
|
||||
This parameter is deprecated, use `data_storage_version` instead.
|
||||
enable_v2_manifest_paths: bool, optional, default False
|
||||
Use the new V2 manifest paths. These paths provide more efficient
|
||||
@@ -759,9 +758,7 @@ class AsyncConnection(object):
|
||||
mode = "exist_ok"
|
||||
|
||||
if not data_storage_version:
|
||||
data_storage_version = (
|
||||
"legacy" if use_legacy_format is None or use_legacy_format else "stable"
|
||||
)
|
||||
data_storage_version = "legacy" if use_legacy_format else "stable"
|
||||
|
||||
if data is None:
|
||||
new_table = await self._inner.create_empty_table(
|
||||
|
||||
@@ -594,7 +594,9 @@ async def test_create_in_v2_mode(tmp_path):
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
|
||||
# Create table in v1 mode
|
||||
tbl = await db.create_table("test", data=make_data(), schema=schema)
|
||||
tbl = await db.create_table(
|
||||
"test", data=make_data(), schema=schema, data_storage_version="legacy"
|
||||
)
|
||||
|
||||
async def is_in_v2_mode(tbl):
|
||||
batches = await tbl.query().to_batches(max_batch_length=1024 * 10)
|
||||
@@ -626,7 +628,9 @@ async def test_create_in_v2_mode(tmp_path):
|
||||
assert await is_in_v2_mode(tbl)
|
||||
|
||||
# Create an empty table in v1 mode (must be requested explicitly; v2 is the default)
|
||||
tbl = await db.create_table("test_empty_v2_default", data=None, schema=schema)
|
||||
tbl = await db.create_table(
|
||||
"test_empty_v2_default", data=None, schema=schema, data_storage_version="legacy"
|
||||
)
|
||||
await tbl.add(make_table())
|
||||
|
||||
assert not await is_in_v2_mode(tbl)
|
||||
|
||||
@@ -307,7 +307,7 @@ impl<const HAS_DATA: bool, T: IntoArrow> CreateTableBuilder<HAS_DATA, T> {
|
||||
|
||||
/// Set the data storage version.
|
||||
///
|
||||
/// The default is `LanceFileVersion::Legacy`.
|
||||
/// The default is `LanceFileVersion::Stable`.
|
||||
pub fn data_storage_version(mut self, data_storage_version: LanceFileVersion) -> Self {
|
||||
self.data_storage_version = Some(data_storage_version);
|
||||
self
|
||||
@@ -315,13 +315,9 @@ impl<const HAS_DATA: bool, T: IntoArrow> CreateTableBuilder<HAS_DATA, T> {
|
||||
|
||||
/// Set to true to use the v1 format for data files
|
||||
///
|
||||
/// This is currently defaulted to true and can be set to false to opt-in
|
||||
/// to the new format. This should only be used for experimentation and
|
||||
/// evaluation. The new format is still in beta and may change in ways that
|
||||
/// are not backwards compatible.
|
||||
///
|
||||
/// Once the new format is stable, the default will change to `false` for
|
||||
/// several releases and then eventually this option will be removed.
|
||||
/// This is set to false by default to enable the stable format.
|
||||
/// This should only be used for experimentation and
|
||||
/// evaluation. This option may be removed in a future release.
|
||||
#[deprecated(since = "0.9.0", note = "use data_storage_version instead")]
|
||||
pub fn use_legacy_format(mut self, use_legacy_format: bool) -> Self {
|
||||
self.data_storage_version = if use_legacy_format {
|
||||
@@ -1240,6 +1236,7 @@ mod tests {
|
||||
|
||||
let tbl = db
|
||||
.create_table("v1_test", make_data())
|
||||
.data_storage_version(LanceFileVersion::Legacy)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -145,7 +145,7 @@ impl SentenceTransformersEmbeddingsBuilder {
|
||||
let device = self.device.unwrap_or(Device::Cpu);
|
||||
|
||||
let repo = if let Some(revision) = self.revision {
|
||||
Repo::with_revision(model_id, RepoType::Model, revision.to_string())
|
||||
Repo::with_revision(model_id, RepoType::Model, revision)
|
||||
} else {
|
||||
Repo::new(model_id, RepoType::Model)
|
||||
};
|
||||
|
||||
@@ -229,7 +229,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
message: "plain_query is not yet supported on LanceDB cloud.".into(),
|
||||
})
|
||||
}
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<()> {
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
|
||||
let request = self.client.post(&format!("/table/{}/update/", self.name));
|
||||
|
||||
let mut updates = Vec::new();
|
||||
@@ -245,9 +245,16 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
|
||||
let response = self.client.send(request).await?;
|
||||
|
||||
self.check_table_response(response).await?;
|
||||
let response = self.check_table_response(response).await?;
|
||||
|
||||
Ok(())
|
||||
let body = response.text().await?;
|
||||
|
||||
serde_json::from_str(&body).map_err(|e| Error::Http {
|
||||
message: format!(
|
||||
"Failed to parse updated rows result from response {}: {}",
|
||||
body, e
|
||||
),
|
||||
})
|
||||
}
|
||||
async fn delete(&self, predicate: &str) -> Result<()> {
|
||||
let body = serde_json::json!({ "predicate": predicate });
|
||||
@@ -395,7 +402,7 @@ mod tests {
|
||||
Box::pin(table.version().map_ok(|_| ())),
|
||||
Box::pin(table.schema().map_ok(|_| ())),
|
||||
Box::pin(table.count_rows(None).map_ok(|_| ())),
|
||||
Box::pin(table.update().column("a", "a + 1").execute()),
|
||||
Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
|
||||
Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
|
||||
Box::pin(table.merge_insert(&["test"]).execute(example_data())),
|
||||
Box::pin(table.delete("false")), // TODO: other endpoints.
|
||||
@@ -619,7 +626,7 @@ mod tests {
|
||||
assert_eq!(only_if, "b > 10");
|
||||
}
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
http::Response::builder().status(200).body("1").unwrap()
|
||||
});
|
||||
|
||||
table
|
||||
|
||||
@@ -349,8 +349,9 @@ impl UpdateBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Executes the update operation
|
||||
pub async fn execute(self) -> Result<()> {
|
||||
/// Executes the update operation.
|
||||
/// Returns the number of rows that were updated.
|
||||
pub async fn execute(self) -> Result<u64> {
|
||||
if self.columns.is_empty() {
|
||||
Err(Error::InvalidInput {
|
||||
message: "at least one column must be specified in an update operation".to_string(),
|
||||
@@ -396,7 +397,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
|
||||
data: Box<dyn arrow_array::RecordBatchReader + Send>,
|
||||
) -> Result<()>;
|
||||
async fn delete(&self, predicate: &str) -> Result<()>;
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<()>;
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64>;
|
||||
async fn create_index(&self, index: IndexBuilder) -> Result<()>;
|
||||
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
|
||||
async fn merge_insert(
|
||||
@@ -1782,9 +1783,6 @@ impl TableInternal for NativeTable {
|
||||
let data =
|
||||
MaybeEmbedded::try_new(data, self.table_definition().await?, add.embedding_registry)?;
|
||||
|
||||
// Still use the legacy lance format (v1) by default.
|
||||
// We don't want to accidentally switch to v2 format during an add operation.
|
||||
// If the table is already v2 this won't have any effect.
|
||||
let mut lance_params = add.write_options.lance_write_params.unwrap_or(WriteParams {
|
||||
mode: match add.mode {
|
||||
AddDataMode::Append => WriteMode::Append,
|
||||
@@ -1846,7 +1844,7 @@ impl TableInternal for NativeTable {
|
||||
}
|
||||
}
|
||||
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<()> {
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
|
||||
let dataset = self.dataset.get().await?.clone();
|
||||
let mut builder = LanceUpdateBuilder::new(Arc::new(dataset));
|
||||
if let Some(predicate) = update.filter {
|
||||
@@ -1858,9 +1856,11 @@ impl TableInternal for NativeTable {
|
||||
}
|
||||
|
||||
let operation = builder.build()?;
|
||||
let ds = operation.execute().await?;
|
||||
self.dataset.set_latest(ds.as_ref().clone()).await;
|
||||
Ok(())
|
||||
let res = operation.execute().await?;
|
||||
self.dataset
|
||||
.set_latest(res.new_dataset.as_ref().clone())
|
||||
.await;
|
||||
Ok(res.rows_updated)
|
||||
}
|
||||
|
||||
async fn build_plan(
|
||||
|
||||
Reference in New Issue
Block a user