From f5e9c073f043cc5f89e25bc3690f1de7905f120e Mon Sep 17 00:00:00 2001 From: Bert Date: Thu, 26 Oct 2023 13:10:17 -0400 Subject: [PATCH] feat: added data stats apis (#596) --- Cargo.toml | 8 ++++---- rust/vectordb/src/index/vector.rs | 22 ++++++++++++++++++++++ rust/vectordb/src/table.rs | 30 +++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 77e7bc8a..046b3f63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,9 +5,9 @@ exclude = ["python"] resolver = "2" [workspace.dependencies] -lance = { "version" = "=0.8.7", "features" = ["dynamodb"] } -lance-linalg = { "version" = "=0.8.7" } -lance-testing = { "version" = "=0.8.7" } +lance = { "version" = "=0.8.8", "features" = ["dynamodb"] } +lance-linalg = { "version" = "=0.8.8" } +lance-testing = { "version" = "=0.8.8" } # Note that this one does not include pyarrow arrow = { version = "47.0.0", optional = false } arrow-array = "47.0" @@ -19,7 +19,7 @@ arrow-arith = "47.0" arrow-cast = "47.0" chrono = "0.4.23" half = { "version" = "=2.3.1", default-features = false, features = [ - "num-traits" + "num-traits", ] } log = "0.4" object_store = "0.7.1" diff --git a/rust/vectordb/src/index/vector.rs b/rust/vectordb/src/index/vector.rs index 88f0fed1..dd327c09 100644 --- a/rust/vectordb/src/index/vector.rs +++ b/rust/vectordb/src/index/vector.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use lance::format::{Index, Manifest}; use lance::index::vector::ivf::IvfBuildParams; use lance::index::vector::pq::PQBuildParams; use lance::index::vector::VectorIndexParams; @@ -106,6 +107,27 @@ impl VectorIndexBuilder for IvfPQIndexBuilder { } } +pub struct VectorIndex { + pub columns: Vec<String>, + pub index_name: String, + pub index_uuid: String, +} + +impl VectorIndex { + pub fn new_from_format(manifest: &Manifest, index: &Index) -> VectorIndex { + let fields = index + .fields + .iter() + .map(|i| manifest.schema.fields[*i as usize].name.clone()) + .collect(); + VectorIndex { + columns: fields, + index_name: index.name.clone(), + index_uuid: index.uuid.to_string(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/vectordb/src/table.rs b/rust/vectordb/src/table.rs index b7c40e87..3a22c014 100644 --- a/rust/vectordb/src/table.rs +++ b/rust/vectordb/src/table.rs @@ -27,7 +27,7 @@ use lance::io::object_store::WrappingObjectStore; use std::path::Path; use crate::error::{Error, Result}; -use crate::index::vector::VectorIndexBuilder; +use crate::index::vector::{VectorIndexBuilder, VectorIndex}; use crate::query::Query; use crate::utils::{PatchReadParam, PatchWriteParam}; use crate::WriteMode; @@ -371,6 +371,34 @@ impl Table { self.dataset = Arc::new(dataset); Ok(metrics) } + + pub fn count_fragments(&self) -> usize { + self.dataset.count_fragments() + } + + pub fn count_deleted_rows(&self) -> usize { + self.dataset.count_deleted_rows() + } + + pub fn num_small_files(&self, max_rows_per_group: usize) -> usize { + self.dataset.num_small_files(max_rows_per_group) + } + + pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> { + Ok(self.dataset.count_indexed_rows(index_uuid).await?) + } + + pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> { + Ok(self.dataset.count_unindexed_rows(index_uuid).await?) 
+ } + + pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> { + let (indices, mf) = futures::try_join!( + self.dataset.load_indices(), + self.dataset.latest_manifest() + )?; + Ok(indices.iter().map(|i| VectorIndex::new_from_format(&mf, i)).collect()) + } } #[cfg(test)]