feat: upgrade to lance v0.25.0-beta.5 (#2248)

- adds `loss` to the index stats for vector indices
- `optimize` can now retrain vector indices

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-03-22 01:12:23 +08:00
committed by GitHub
parent ba1ded933a
commit 7ff6ec7fe3
11 changed files with 89 additions and 25 deletions

25
Cargo.lock generated
View File

@@ -2673,7 +2673,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"rand 0.8.5",
]
@@ -3651,7 +3651,7 @@ dependencies = [
[[package]]
name = "lance"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow",
"arrow-arith",
@@ -3711,7 +3711,7 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3729,7 +3729,7 @@ dependencies = [
[[package]]
name = "lance-core"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3766,7 +3766,7 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow",
"arrow-array",
@@ -3787,12 +3787,13 @@ dependencies = [
"prost",
"snafu",
"tokio",
"tracing",
]
[[package]]
name = "lance-encoding"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrayref",
"arrow",
@@ -3831,7 +3832,7 @@ dependencies = [
[[package]]
name = "lance-file"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -3866,7 +3867,7 @@ dependencies = [
[[package]]
name = "lance-index"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow",
"arrow-array",
@@ -3919,7 +3920,7 @@ dependencies = [
[[package]]
name = "lance-io"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow",
"arrow-arith",
@@ -3958,7 +3959,7 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow-array",
"arrow-ord",
@@ -3982,7 +3983,7 @@ dependencies = [
[[package]]
name = "lance-table"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow",
"arrow-array",
@@ -4022,7 +4023,7 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "0.25.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
dependencies = [
"arrow-array",
"arrow-schema",

View File

@@ -23,14 +23,14 @@ rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.25.0", "features" = [
"dynamodb",
], tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-table = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
], tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-table = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
# Note that this one does not include pyarrow
arrow = { version = "54.1", optional = false }
arrow-array = "54.1"

View File

@@ -30,6 +30,17 @@ The type of the index
***
### loss?
```ts
optional loss: number;
```
The KMeans loss value of the index.
It is only present for vector indices.
***
### numIndexedRows
```ts

View File

@@ -470,6 +470,8 @@ describe("When creating an index", () => {
indexType: "IvfPq",
columns: ["vec"],
});
const stats = await tbl.indexStats("vec_idx");
expect(stats?.loss).toBeDefined();
// Search without specifying the column
let rst = await tbl
@@ -730,6 +732,7 @@ describe("When creating an index", () => {
expect(stats?.distanceType).toBeUndefined();
expect(stats?.indexType).toEqual("BTREE");
expect(stats?.numIndices).toEqual(1);
expect(stats?.loss).toBeUndefined();
});
test("when getting stats on non-existent index", async () => {

View File

@@ -498,6 +498,9 @@ pub struct IndexStatistics {
pub distance_type: Option<String>,
/// The number of parts this index is split into.
pub num_indices: Option<u32>,
/// The KMeans loss value of the index.
/// It is only present for vector indices.
pub loss: Option<f64>,
}
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
fn from(value: lancedb::index::IndexStatistics) -> Self {
@@ -507,6 +510,7 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
index_type: value.index_type.to_string(),
distance_type: value.distance_type.map(|d| d.to_string()),
num_indices: value.num_indices,
loss: value.loss,
}
}
}

View File

@@ -1185,6 +1185,7 @@ class Table(ABC):
*,
cleanup_older_than: Optional[timedelta] = None,
delete_unverified: bool = False,
retrain: bool = False,
):
"""
Optimize the on-disk data and indices for better performance.
@@ -1208,6 +1209,11 @@ class Table(ABC):
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
If True, retrain the vector indices. Retraining refines the IVF clustering
and quantization, which may improve search accuracy. It is faster than
re-creating the index from scratch, so it is recommended to try this first
when the data distribution has changed significantly.
Experimental API
----------------
@@ -2342,6 +2348,7 @@ class LanceTable(Table):
*,
cleanup_older_than: Optional[timedelta] = None,
delete_unverified: bool = False,
retrain: bool = False,
):
"""
Optimize the on-disk data and indices for better performance.
@@ -2365,6 +2372,11 @@ class LanceTable(Table):
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
If True, retrain the vector indices. Retraining refines the IVF clustering
and quantization, which may improve search accuracy. It is faster than
re-creating the index from scratch, so it is recommended to try this first
when the data distribution has changed significantly.
Experimental API
----------------
@@ -2388,6 +2400,7 @@ class LanceTable(Table):
self._table.optimize(
cleanup_older_than=cleanup_older_than,
delete_unverified=delete_unverified,
retrain=retrain,
)
)
@@ -3590,6 +3603,7 @@ class AsyncTable:
*,
cleanup_older_than: Optional[timedelta] = None,
delete_unverified: bool = False,
retrain=False,
) -> OptimizeStats:
"""
Optimize the on-disk data and indices for better performance.
@@ -3613,6 +3627,11 @@ class AsyncTable:
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
If True, retrain the vector indices. Retraining refines the IVF clustering
and quantization, which may improve search accuracy. It is faster than
re-creating the index from scratch, so it is recommended to try this first
when the data distribution has changed significantly.
Experimental API
----------------
@@ -3636,7 +3655,9 @@ class AsyncTable:
if cleanup_older_than is not None:
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
return await self._inner.optimize(
cleanup_since_ms=cleanup_since_ms, delete_unverified=delete_unverified
cleanup_since_ms=cleanup_since_ms,
delete_unverified=delete_unverified,
retrain=retrain,
)
async def list_indices(self) -> Iterable[IndexConfig]:
@@ -3729,6 +3750,8 @@ class IndexStatistics:
The distance type used by the index.
num_indices: Optional[int]
The number of parts the index is split into.
loss: Optional[float]
The KMeans loss of the index. Only present for vector indices.
"""
num_indexed_rows: int
@@ -3738,6 +3761,7 @@ class IndexStatistics:
]
distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
num_indices: Optional[int] = None
loss: Optional[float] = None
# This exists for backwards compatibility with an older API, which returned
# a dictionary instead of a class.

View File

@@ -131,6 +131,7 @@ async def test_create_vector_index(some_table: AsyncTable):
assert stats.num_indexed_rows == await some_table.count_rows()
assert stats.num_unindexed_rows == 0
assert stats.num_indices == 1
assert stats.loss >= 0.0
@pytest.mark.asyncio
@@ -154,6 +155,7 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
assert stats.num_indexed_rows == await some_table.count_rows()
assert stats.num_unindexed_rows == 0
assert stats.num_indices == 1
assert stats.loss >= 0.0
@pytest.mark.asyncio

View File

@@ -235,6 +235,10 @@ impl Table {
dict.set_item("num_indices", num_indices)?;
}
if let Some(loss) = stats.loss {
dict.set_item("loss", loss)?;
}
Ok(Some(dict.unbind()))
})
} else {
@@ -312,11 +316,12 @@ impl Table {
}
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
pub fn optimize(
self_: PyRef<'_, Self>,
cleanup_since_ms: Option<u64>,
delete_unverified: Option<bool>,
retrain: Option<bool>,
) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
let older_than = if let Some(ms) = cleanup_since_ms {
@@ -352,9 +357,10 @@ impl Table {
.prune
.unwrap();
inner
.optimize(lancedb::table::OptimizeAction::Index(
OptimizeOptions::default(),
))
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
Some(true) => OptimizeOptions::retrain(),
_ => OptimizeOptions::default(),
}))
.await
.infer_error()?;
Ok(OptimizeStats {

View File

@@ -174,6 +174,7 @@ pub(crate) struct IndexMetadata {
pub metric_type: Option<DistanceType>,
// Sometimes the index type is provided at this level.
pub index_type: Option<IndexType>,
pub loss: Option<f64>,
}
// This struct is used to deserialize the JSON data returned from the Lance API
@@ -205,4 +206,6 @@ pub struct IndexStatistics {
pub distance_type: Option<DistanceType>,
/// The number of parts this index is split into.
pub num_indices: Option<u32>,
/// The KMeans loss value of the index; only present for vector indices.
pub loss: Option<f64>,
}

View File

@@ -1884,6 +1884,7 @@ mod tests {
index_type: IndexType::IvfPq,
distance_type: Some(DistanceType::L2),
num_indices: None,
loss: None,
};
assert_eq!(indices, expected);

View File

@@ -2373,12 +2373,20 @@ impl BaseTable for NativeTable {
.ok_or_else(|| Error::InvalidInput {
message: "index statistics was missing index type".to_string(),
})?;
let loss = stats
.indices
.iter()
.map(|index| index.loss.unwrap_or_default())
.sum::<f64>();
let loss = first_index.loss.map(|first_loss| first_loss + loss);
Ok(Some(IndexStatistics {
num_indexed_rows: stats.num_indexed_rows,
num_unindexed_rows: stats.num_unindexed_rows,
index_type,
distance_type: first_index.metric_type,
num_indices: stats.num_indices,
loss,
}))
}
}
@@ -3045,6 +3053,7 @@ mod tests {
assert_eq!(stats.num_unindexed_rows, 0);
assert_eq!(stats.index_type, crate::index::IndexType::IvfPq);
assert_eq!(stats.distance_type, Some(crate::DistanceType::L2));
assert!(stats.loss.is_some());
table.drop_index(index_name).await.unwrap();
assert_eq!(table.list_indices().await.unwrap().len(), 0);