From 4243eaee9362216437c0e906c3e141531d9cef5b Mon Sep 17 00:00:00 2001 From: Bert Date: Thu, 18 Jan 2024 23:44:22 -0500 Subject: [PATCH] bump lance to 0.9.7 (#826) --- Cargo.toml | 10 +++--- python/pyproject.toml | 2 +- rust/vectordb/Cargo.toml | 2 ++ rust/vectordb/src/index/vector.rs | 8 +++++ rust/vectordb/src/table.rs | 58 +++++++++++++++++++++++++++---- 5 files changed, 67 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4e8482d2..9923268e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,10 +5,10 @@ exclude = ["python"] resolver = "2" [workspace.dependencies] -lance = { "version" = "=0.9.6", "features" = ["dynamodb"] } -lance-index = { "version" = "=0.9.6" } -lance-linalg = { "version" = "=0.9.6" } -lance-testing = { "version" = "=0.9.6" } +lance = { "version" = "=0.9.7", "features" = ["dynamodb"] } +lance-index = { "version" = "=0.9.7" } +lance-linalg = { "version" = "=0.9.7" } +lance-testing = { "version" = "=0.9.7" } # Note that this one does not include pyarrow arrow = { version = "49.0.0", optional = false } arrow-array = "49.0" @@ -23,6 +23,6 @@ half = { "version" = "=2.3.1", default-features = false, features = [ "num-traits", ] } log = "0.4" -object_store = "0.8.0" +object_store = "0.9.0" snafu = "0.7.4" url = "2" diff --git a/python/pyproject.toml b/python/pyproject.toml index 2a386fd9..bff33e0f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ name = "lancedb" version = "0.5.0" dependencies = [ "deprecation", - "pylance==0.9.6", + "pylance==0.9.7", "ratelimiter~=1.0", "retry>=0.9.2", "tqdm>=4.27.0", diff --git a/rust/vectordb/Cargo.toml b/rust/vectordb/Cargo.toml index 63c73221..70e7c868 100644 --- a/rust/vectordb/Cargo.toml +++ b/rust/vectordb/Cargo.toml @@ -31,6 +31,8 @@ bytes = "1" futures = "0" num-traits = "0" url = { workspace = true } +serde = { version = "^1" } +serde_json = { version = "1" } [dev-dependencies] tempfile = "3.5.0" diff --git a/rust/vectordb/src/index/vector.rs b/rust/vectordb/src/index/vector.rs index 4017d635..418b43c4 100644 --- a/rust/vectordb/src/index/vector.rs +++ b/rust/vectordb/src/index/vector.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use serde::Deserialize; + use lance::format::{Index, Manifest}; use lance::index::vector::pq::PQBuildParams; use lance::index::vector::VectorIndexParams; @@ -132,6 +134,12 @@ impl VectorIndex { } } +#[derive(Debug, Deserialize)] +pub struct VectorIndexStatistics { + pub num_indexed_rows: usize, + pub num_unindexed_rows: usize, +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/vectordb/src/table.rs b/rust/vectordb/src/table.rs index fba001a1..0c370092 100644 --- a/rust/vectordb/src/table.rs +++ b/rust/vectordb/src/table.rs @@ -15,6 +15,7 @@ use chrono::Duration; use lance::dataset::builder::DatasetBuilder; use lance::index::scalar::ScalarIndexParams; +use lance_index::optimize::OptimizeOptions; use lance_index::IndexType; use std::sync::Arc; @@ -25,12 +26,12 @@ use lance::dataset::optimize::{ compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions, }; use lance::dataset::{Dataset, UpdateBuilder, WriteParams}; -use lance::index::DatasetIndexExt; use lance::io::object_store::WrappingObjectStore; +use lance_index::DatasetIndexExt; use std::path::Path; use crate::error::{Error, Result}; -use crate::index::vector::{VectorIndex, VectorIndexBuilder}; +use crate::index::vector::{VectorIndex, VectorIndexBuilder, VectorIndexStatistics}; use crate::query::Query; use crate::utils::{PatchReadParam, PatchWriteParam}; use crate::WriteMode; @@ -273,10 +274,9 @@ impl Table { Ok(()) } - pub async fn optimize_indices(&mut self) -> Result<()> { + pub async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()> { let mut dataset = self.dataset.as_ref().clone(); - - dataset.optimize_indices().await?; + dataset.optimize_indices(options).await?; Ok(()) } @@ -426,11 +426,17 @@ impl Table { } pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result> { - Ok(self.dataset.count_indexed_rows(index_uuid).await?) + match self.load_index_stats(index_uuid).await? { + Some(stats) => Ok(Some(stats.num_indexed_rows)), + None => Ok(None), + } } pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result> { - Ok(self.dataset.count_unindexed_rows(index_uuid).await?) + match self.load_index_stats(index_uuid).await? { + Some(stats) => Ok(Some(stats.num_unindexed_rows)), + None => Ok(None), + } } pub async fn load_indices(&self) -> Result> { @@ -441,6 +447,30 @@ impl Table { .map(|i| VectorIndex::new_from_format(&mf, i)) .collect()) } + + async fn load_index_stats(&self, index_uuid: &str) -> Result> { + let index = self + .load_indices() + .await? + .into_iter() + .find(|i| i.index_uuid == index_uuid); + if index.is_none() { + return Ok(None); + } + let index_stats = self + .dataset + .index_statistics(&index.unwrap().index_name) + .await?; + let index_stats: VectorIndexStatistics = + serde_json::from_str(&index_stats).map_err(|e| Error::Lance { + message: format!( + "error deserializing index statistics {}: {}", + e, index_stats + ), + })?; + + Ok(Some(index_stats)) + } } #[cfg(test)] @@ -963,6 +993,9 @@ mod tests { .unwrap(); let mut i = IvfPQIndexBuilder::new(); + assert_eq!(table.count_indexed_rows("my_index").await.unwrap(), None); + assert_eq!(table.count_unindexed_rows("my_index").await.unwrap(), None); + let index_builder = i .column("embeddings".to_string()) .index_name("my_index".to_string()) @@ -974,6 +1007,17 @@ mod tests { assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1); assert_eq!(table.count_rows().await.unwrap(), 512); assert_eq!(table.name, "test"); + + let indices = table.load_indices().await.unwrap(); + let index_uuid = &indices[0].index_uuid; + assert_eq!( + table.count_indexed_rows(index_uuid).await.unwrap(), + Some(512) + ); + assert_eq!( + table.count_unindexed_rows(index_uuid).await.unwrap(), + Some(0) + ); } fn create_fixed_size_list(values: T, list_size: i32) -> Result {