mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 11:22:58 +00:00
feat: upgrade to lance v0.25.0-beta.5 (#2248)
- adds `loss` to the index stats for vector indices - `optimize` can now retrain vector indices --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
25
Cargo.lock
generated
25
Cargo.lock
generated
@@ -2673,7 +2673,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
@@ -3651,7 +3651,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -3711,7 +3711,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -3729,7 +3729,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -3766,7 +3766,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -3787,12 +3787,13 @@ dependencies = [
|
||||
"prost",
|
||||
"snafu",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"arrow",
|
||||
@@ -3831,7 +3832,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -3866,7 +3867,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -3919,7 +3920,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -3958,7 +3959,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ord",
|
||||
@@ -3982,7 +3983,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4022,7 +4023,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "0.25.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
|
||||
16
Cargo.toml
16
Cargo.toml
@@ -23,14 +23,14 @@ rust-version = "1.78.0"
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.25.0", "features" = [
|
||||
"dynamodb",
|
||||
], tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" }
|
||||
], tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "54.1", optional = false }
|
||||
arrow-array = "54.1"
|
||||
|
||||
@@ -30,6 +30,17 @@ The type of the index
|
||||
|
||||
***
|
||||
|
||||
### loss?
|
||||
|
||||
```ts
|
||||
optional loss: number;
|
||||
```
|
||||
|
||||
The KMeans loss value of the index,
|
||||
it is only present for vector indices.
|
||||
|
||||
***
|
||||
|
||||
### numIndexedRows
|
||||
|
||||
```ts
|
||||
|
||||
@@ -470,6 +470,8 @@ describe("When creating an index", () => {
|
||||
indexType: "IvfPq",
|
||||
columns: ["vec"],
|
||||
});
|
||||
const stats = await tbl.indexStats("vec_idx");
|
||||
expect(stats?.loss).toBeDefined();
|
||||
|
||||
// Search without specifying the column
|
||||
let rst = await tbl
|
||||
@@ -730,6 +732,7 @@ describe("When creating an index", () => {
|
||||
expect(stats?.distanceType).toBeUndefined();
|
||||
expect(stats?.indexType).toEqual("BTREE");
|
||||
expect(stats?.numIndices).toEqual(1);
|
||||
expect(stats?.loss).toBeUndefined();
|
||||
});
|
||||
|
||||
test("when getting stats on non-existent index", async () => {
|
||||
|
||||
@@ -498,6 +498,9 @@ pub struct IndexStatistics {
|
||||
pub distance_type: Option<String>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
/// The KMeans loss value of the index,
|
||||
/// it is only present for vector indices.
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
fn from(value: lancedb::index::IndexStatistics) -> Self {
|
||||
@@ -507,6 +510,7 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
index_type: value.index_type.to_string(),
|
||||
distance_type: value.distance_type.map(|d| d.to_string()),
|
||||
num_indices: value.num_indices,
|
||||
loss: value.loss,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1185,6 +1185,7 @@ class Table(ABC):
|
||||
*,
|
||||
cleanup_older_than: Optional[timedelta] = None,
|
||||
delete_unverified: bool = False,
|
||||
retrain: bool = False,
|
||||
):
|
||||
"""
|
||||
Optimize the on-disk data and indices for better performance.
|
||||
@@ -1208,6 +1209,11 @@ class Table(ABC):
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices. This refines the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first
|
||||
when the data distribution has changed significantly.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -2342,6 +2348,7 @@ class LanceTable(Table):
|
||||
*,
|
||||
cleanup_older_than: Optional[timedelta] = None,
|
||||
delete_unverified: bool = False,
|
||||
retrain: bool = False,
|
||||
):
|
||||
"""
|
||||
Optimize the on-disk data and indices for better performance.
|
||||
@@ -2365,6 +2372,11 @@ class LanceTable(Table):
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices. This refines the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first
|
||||
when the data distribution has changed significantly.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -2388,6 +2400,7 @@ class LanceTable(Table):
|
||||
self._table.optimize(
|
||||
cleanup_older_than=cleanup_older_than,
|
||||
delete_unverified=delete_unverified,
|
||||
retrain=retrain,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -3590,6 +3603,7 @@ class AsyncTable:
|
||||
*,
|
||||
cleanup_older_than: Optional[timedelta] = None,
|
||||
delete_unverified: bool = False,
|
||||
retrain=False,
|
||||
) -> OptimizeStats:
|
||||
"""
|
||||
Optimize the on-disk data and indices for better performance.
|
||||
@@ -3613,6 +3627,11 @@ class AsyncTable:
|
||||
in-progress operation (e.g. appending new data) and these files will not
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices. This refines the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first
|
||||
when the data distribution has changed significantly.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -3636,7 +3655,9 @@ class AsyncTable:
|
||||
if cleanup_older_than is not None:
|
||||
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
||||
return await self._inner.optimize(
|
||||
cleanup_since_ms=cleanup_since_ms, delete_unverified=delete_unverified
|
||||
cleanup_since_ms=cleanup_since_ms,
|
||||
delete_unverified=delete_unverified,
|
||||
retrain=retrain,
|
||||
)
|
||||
|
||||
async def list_indices(self) -> Iterable[IndexConfig]:
|
||||
@@ -3729,6 +3750,8 @@ class IndexStatistics:
|
||||
The distance type used by the index.
|
||||
num_indices: Optional[int]
|
||||
The number of parts the index is split into.
|
||||
loss: Optional[float]
|
||||
The KMeans loss for the index; only present for vector indices.
|
||||
"""
|
||||
|
||||
num_indexed_rows: int
|
||||
@@ -3738,6 +3761,7 @@ class IndexStatistics:
|
||||
]
|
||||
distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
|
||||
num_indices: Optional[int] = None
|
||||
loss: Optional[float] = None
|
||||
|
||||
# This exists for backwards compatibility with an older API, which returned
|
||||
# a dictionary instead of a class.
|
||||
|
||||
@@ -131,6 +131,7 @@ async def test_create_vector_index(some_table: AsyncTable):
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
assert stats.loss >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -154,6 +155,7 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
assert stats.loss >= 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -235,6 +235,10 @@ impl Table {
|
||||
dict.set_item("num_indices", num_indices)?;
|
||||
}
|
||||
|
||||
if let Some(loss) = stats.loss {
|
||||
dict.set_item("loss", loss)?;
|
||||
}
|
||||
|
||||
Ok(Some(dict.unbind()))
|
||||
})
|
||||
} else {
|
||||
@@ -312,11 +316,12 @@ impl Table {
|
||||
}
|
||||
|
||||
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
|
||||
pub fn optimize(
|
||||
self_: PyRef<'_, Self>,
|
||||
cleanup_since_ms: Option<u64>,
|
||||
delete_unverified: Option<bool>,
|
||||
retrain: Option<bool>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
let older_than = if let Some(ms) = cleanup_since_ms {
|
||||
@@ -352,9 +357,10 @@ impl Table {
|
||||
.prune
|
||||
.unwrap();
|
||||
inner
|
||||
.optimize(lancedb::table::OptimizeAction::Index(
|
||||
OptimizeOptions::default(),
|
||||
))
|
||||
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
|
||||
Some(true) => OptimizeOptions::retrain(),
|
||||
_ => OptimizeOptions::default(),
|
||||
}))
|
||||
.await
|
||||
.infer_error()?;
|
||||
Ok(OptimizeStats {
|
||||
|
||||
@@ -174,6 +174,7 @@ pub(crate) struct IndexMetadata {
|
||||
pub metric_type: Option<DistanceType>,
|
||||
// Sometimes the index type is provided at this level.
|
||||
pub index_type: Option<IndexType>,
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
|
||||
// This struct is used to deserialize the JSON data returned from the Lance API
|
||||
@@ -205,4 +206,6 @@ pub struct IndexStatistics {
|
||||
pub distance_type: Option<DistanceType>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
/// The KMeans loss value of the index; only present for vector indices.
|
||||
pub loss: Option<f64>,
|
||||
}
|
||||
|
||||
@@ -1884,6 +1884,7 @@ mod tests {
|
||||
index_type: IndexType::IvfPq,
|
||||
distance_type: Some(DistanceType::L2),
|
||||
num_indices: None,
|
||||
loss: None,
|
||||
};
|
||||
assert_eq!(indices, expected);
|
||||
|
||||
|
||||
@@ -2373,12 +2373,20 @@ impl BaseTable for NativeTable {
|
||||
.ok_or_else(|| Error::InvalidInput {
|
||||
message: "index statistics was missing index type".to_string(),
|
||||
})?;
|
||||
let loss = stats
|
||||
.indices
|
||||
.iter()
|
||||
.map(|index| index.loss.unwrap_or_default())
|
||||
.sum::<f64>();
|
||||
|
||||
let loss = first_index.loss.map(|first_loss| first_loss + loss);
|
||||
Ok(Some(IndexStatistics {
|
||||
num_indexed_rows: stats.num_indexed_rows,
|
||||
num_unindexed_rows: stats.num_unindexed_rows,
|
||||
index_type,
|
||||
distance_type: first_index.metric_type,
|
||||
num_indices: stats.num_indices,
|
||||
loss,
|
||||
}))
|
||||
}
|
||||
}
|
||||
@@ -3045,6 +3053,7 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
assert_eq!(stats.index_type, crate::index::IndexType::IvfPq);
|
||||
assert_eq!(stats.distance_type, Some(crate::DistanceType::L2));
|
||||
assert!(stats.loss.is_some());
|
||||
|
||||
table.drop_index(index_name).await.unwrap();
|
||||
assert_eq!(table.list_indices().await.unwrap().len(), 0);
|
||||
|
||||
Reference in New Issue
Block a user