From 7ff6ec7fe31addf216f76ba079576ab35183e8fd Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Sat, 22 Mar 2025 01:12:23 +0800 Subject: [PATCH] feat: upgrade to lance v0.25.0-beta.5 (#2248) - adds `loss` into the index stats for vector index - now `optimize` can retrain the vector index --------- Signed-off-by: BubbleCal --- Cargo.lock | 25 +++++++++++----------- Cargo.toml | 16 +++++++------- docs/src/js/interfaces/IndexStatistics.md | 11 ++++++++++ nodejs/__test__/table.test.ts | 3 +++ nodejs/src/table.rs | 4 ++++ python/python/lancedb/table.py | 26 ++++++++++++++++++++++- python/python/tests/test_index.py | 2 ++ python/src/table.rs | 14 ++++++++---- rust/lancedb/src/index.rs | 3 +++ rust/lancedb/src/remote/table.rs | 1 + rust/lancedb/src/table.rs | 9 ++++++++ 11 files changed, 89 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c1f34c0c..17658e3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2673,7 +2673,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "rand 0.8.5", ] @@ -3651,7 +3651,7 @@ dependencies = [ [[package]] name = "lance" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow", "arrow-arith", @@ -3711,7 +3711,7 @@ dependencies = [ [[package]] name = "lance-arrow" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" 
dependencies = [ "arrow-array", "arrow-buffer", @@ -3729,7 +3729,7 @@ dependencies = [ [[package]] name = "lance-core" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow-array", "arrow-buffer", @@ -3766,7 +3766,7 @@ dependencies = [ [[package]] name = "lance-datafusion" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow", "arrow-array", @@ -3787,12 +3787,13 @@ dependencies = [ "prost", "snafu", "tokio", + "tracing", ] [[package]] name = "lance-encoding" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrayref", "arrow", @@ -3831,7 +3832,7 @@ dependencies = [ [[package]] name = "lance-file" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow-arith", "arrow-array", @@ -3866,7 +3867,7 @@ dependencies = [ [[package]] name = "lance-index" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow", "arrow-array", @@ -3919,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-io" version = "0.25.0" -source = 
"git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow", "arrow-arith", @@ -3958,7 +3959,7 @@ dependencies = [ [[package]] name = "lance-linalg" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow-array", "arrow-ord", @@ -3982,7 +3983,7 @@ dependencies = [ [[package]] name = "lance-table" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow", "arrow-array", @@ -4022,7 +4023,7 @@ dependencies = [ [[package]] name = "lance-testing" version = "0.25.0" -source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.2#15420d5437c3232e87c7d004932cbca3294ece64" +source = "git+https://github.com/lancedb/lance.git?tag=v0.25.0-beta.5#f02095ddb4a57f15769e028919f343e38866d155" dependencies = [ "arrow-array", "arrow-schema", diff --git a/Cargo.toml b/Cargo.toml index d91fef19..97a351c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,14 +23,14 @@ rust-version = "1.78.0" [workspace.dependencies] lance = { "version" = "=0.25.0", "features" = [ "dynamodb", -], tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-table = { 
version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } -lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.2", git = "https://github.com/lancedb/lance.git" } +], tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-io = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-index = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-linalg = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-table = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-testing = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-datafusion = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } +lance-encoding = { version = "=0.25.0", tag = "v0.25.0-beta.5", git = "https://github.com/lancedb/lance.git" } # Note that this one does not include pyarrow arrow = { version = "54.1", optional = false } arrow-array = "54.1" diff --git a/docs/src/js/interfaces/IndexStatistics.md b/docs/src/js/interfaces/IndexStatistics.md index 3b1d8ccf..e9abf7ad 100644 --- a/docs/src/js/interfaces/IndexStatistics.md +++ b/docs/src/js/interfaces/IndexStatistics.md @@ -30,6 +30,17 @@ The type of the index *** +### loss? + +```ts +optional loss: number; +``` + +The KMeans loss value of the index, +it is only present for vector indices. 
+ +*** + ### numIndexedRows ```ts diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 966e4050..821fa983 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -470,6 +470,8 @@ describe("When creating an index", () => { indexType: "IvfPq", columns: ["vec"], }); + const stats = await tbl.indexStats("vec_idx"); + expect(stats?.loss).toBeDefined(); // Search without specifying the column let rst = await tbl @@ -730,6 +732,7 @@ describe("When creating an index", () => { expect(stats?.distanceType).toBeUndefined(); expect(stats?.indexType).toEqual("BTREE"); expect(stats?.numIndices).toEqual(1); + expect(stats?.loss).toBeUndefined(); }); test("when getting stats on non-existent index", async () => { diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index d70640e6..abaccb36 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -498,6 +498,9 @@ pub struct IndexStatistics { pub distance_type: Option, /// The number of parts this index is split into. pub num_indices: Option, + /// The KMeans loss value of the index, + /// it is only present for vector indices. + pub loss: Option, } impl From for IndexStatistics { fn from(value: lancedb::index::IndexStatistics) -> Self { @@ -507,6 +510,7 @@ impl From for IndexStatistics { index_type: value.index_type.to_string(), distance_type: value.distance_type.map(|d| d.to_string()), num_indices: value.num_indices, + loss: value.loss, } } } diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index b11b6d9f..aebc2c82 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -1185,6 +1185,7 @@ class Table(ABC): *, cleanup_older_than: Optional[timedelta] = None, delete_unverified: bool = False, + retrain: bool = False, ): """ Optimize the on-disk data and indices for better performance. @@ -1208,6 +1209,11 @@ class Table(ABC): in-progress operation (e.g. 
appending new data) and these files will not be deleted unless they are at least 7 days old. If delete_unverified is True then these files will be deleted regardless of their age. + retrain: bool, default False + If True, retrain the vector indices, this would refine the IVF clustering + and quantization, which may improve the search accuracy. It's faster than + re-creating the index from scratch, so it's recommended to try this first, + when the data distribution has changed significantly. Experimental API ---------------- @@ -2342,6 +2348,7 @@ class LanceTable(Table): *, cleanup_older_than: Optional[timedelta] = None, delete_unverified: bool = False, + retrain: bool = False, ): """ Optimize the on-disk data and indices for better performance. @@ -2365,6 +2372,11 @@ class LanceTable(Table): in-progress operation (e.g. appending new data) and these files will not be deleted unless they are at least 7 days old. If delete_unverified is True then these files will be deleted regardless of their age. + retrain: bool, default False + If True, retrain the vector indices, this would refine the IVF clustering + and quantization, which may improve the search accuracy. It's faster than + re-creating the index from scratch, so it's recommended to try this first, + when the data distribution has changed significantly. Experimental API ---------------- @@ -2388,6 +2400,7 @@ class LanceTable(Table): self._table.optimize( cleanup_older_than=cleanup_older_than, delete_unverified=delete_unverified, + retrain=retrain, ) ) @@ -3590,6 +3603,7 @@ class AsyncTable: *, cleanup_older_than: Optional[timedelta] = None, delete_unverified: bool = False, + retrain=False, ) -> OptimizeStats: """ Optimize the on-disk data and indices for better performance. @@ -3613,6 +3627,11 @@ class AsyncTable: in-progress operation (e.g. appending new data) and these files will not be deleted unless they are at least 7 days old. 
If delete_unverified is True then these files will be deleted regardless of their age. + retrain: bool, default False + If True, retrain the vector indices, this would refine the IVF clustering + and quantization, which may improve the search accuracy. It's faster than + re-creating the index from scratch, so it's recommended to try this first, + when the data distribution has changed significantly. Experimental API ---------------- @@ -3636,7 +3655,9 @@ class AsyncTable: if cleanup_older_than is not None: cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000) return await self._inner.optimize( - cleanup_since_ms=cleanup_since_ms, delete_unverified=delete_unverified + cleanup_since_ms=cleanup_since_ms, + delete_unverified=delete_unverified, + retrain=retrain, ) async def list_indices(self) -> Iterable[IndexConfig]: @@ -3729,6 +3750,8 @@ class IndexStatistics: The distance type used by the index. num_indices: Optional[int] The number of parts the index is split into. + loss: Optional[float] + The KMeans loss for the index; only present for vector indices. """ num_indexed_rows: int @@ -3738,6 +3761,7 @@ class IndexStatistics: ] distance_type: Optional[Literal["l2", "cosine", "dot"]] = None num_indices: Optional[int] = None + loss: Optional[float] = None # This exists for backwards compatibility with an older API, which returned # a dictionary instead of a class. 
diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index aefecc79..7fcd4e1e 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -131,6 +131,7 @@ async def test_create_vector_index(some_table: AsyncTable): assert stats.num_indexed_rows == await some_table.count_rows() assert stats.num_unindexed_rows == 0 assert stats.num_indices == 1 + assert stats.loss >= 0.0 @pytest.mark.asyncio @@ -154,6 +155,7 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable): assert stats.num_indexed_rows == await some_table.count_rows() assert stats.num_unindexed_rows == 0 assert stats.num_indices == 1 + assert stats.loss >= 0.0 @pytest.mark.asyncio diff --git a/python/src/table.rs b/python/src/table.rs index 73a91dcc..08e83c86 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -235,6 +235,10 @@ impl Table { dict.set_item("num_indices", num_indices)?; } + if let Some(loss) = stats.loss { + dict.set_item("loss", loss)?; + } + Ok(Some(dict.unbind())) }) } else { @@ -312,11 +316,12 @@ impl Table { } /// Optimize the on-disk data by compacting and pruning old data, for better performance. 
- #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))] + #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))] pub fn optimize( self_: PyRef<'_, Self>, cleanup_since_ms: Option, delete_unverified: Option, + retrain: Option, ) -> PyResult> { let inner = self_.inner_ref()?.clone(); let older_than = if let Some(ms) = cleanup_since_ms { @@ -352,9 +357,10 @@ impl Table { .prune .unwrap(); inner - .optimize(lancedb::table::OptimizeAction::Index( - OptimizeOptions::default(), - )) + .optimize(lancedb::table::OptimizeAction::Index(match retrain { + Some(true) => OptimizeOptions::retrain(), + _ => OptimizeOptions::default(), + })) .await .infer_error()?; Ok(OptimizeStats { diff --git a/rust/lancedb/src/index.rs b/rust/lancedb/src/index.rs index efcef35f..60bb31c9 100644 --- a/rust/lancedb/src/index.rs +++ b/rust/lancedb/src/index.rs @@ -174,6 +174,7 @@ pub(crate) struct IndexMetadata { pub metric_type: Option, // Sometimes the index type is provided at this level. pub index_type: Option, + pub loss: Option, } // This struct is used to deserialize the JSON data returned from the Lance API @@ -205,4 +206,6 @@ pub struct IndexStatistics { pub distance_type: Option, /// The number of parts this index is split into. pub num_indices: Option, + /// The KMeans loss value of the index; only present for vector indices. 
+ pub loss: Option, } diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index b11dd496..8a83e39b 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -1884,6 +1884,7 @@ mod tests { index_type: IndexType::IvfPq, distance_type: Some(DistanceType::L2), num_indices: None, + loss: None, }; assert_eq!(indices, expected); diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 692b5d75..c1fb5415 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2373,12 +2373,20 @@ impl BaseTable for NativeTable { .ok_or_else(|| Error::InvalidInput { message: "index statistics was missing index type".to_string(), })?; + let loss = stats + .indices + .iter() + .map(|index| index.loss.unwrap_or_default()) + .sum::(); + + let loss = first_index.loss.map(|first_loss| first_loss + loss); Ok(Some(IndexStatistics { num_indexed_rows: stats.num_indexed_rows, num_unindexed_rows: stats.num_unindexed_rows, index_type, distance_type: first_index.metric_type, num_indices: stats.num_indices, + loss, })) } } @@ -3045,6 +3053,7 @@ mod tests { assert_eq!(stats.num_unindexed_rows, 0); assert_eq!(stats.index_type, crate::index::IndexType::IvfPq); assert_eq!(stats.distance_type, Some(crate::DistanceType::L2)); + assert!(stats.loss.is_some()); table.drop_index(index_name).await.unwrap(); assert_eq!(table.list_indices().await.unwrap().len(), 0);