feat: remote index stats (#1702)

BREAKING CHANGE: the return value of the `index_stats` method has changed,
and all `index_stats` APIs now take an index name instead of a UUID. Several
deprecated index statistics methods were also removed.

* Removes deprecated methods for individual index statistics
* Aligns public `IndexStatistics` struct with API response from LanceDB
Cloud.
* Implements `index_stats` for remote Rust SDK and Python async API.
This commit is contained in:
Will Jones
2024-09-27 12:10:00 -07:00
committed by GitHub
parent c1d9d6f70b
commit f958f4d2e8
16 changed files with 337 additions and 307 deletions

View File

@@ -2683,6 +2683,26 @@ class AsyncTable:
"""
return await self._inner.list_indices()
async def index_stats(self, index_name: str) -> Optional[IndexStatistics]:
    """
    Look up the statistics of a single index.

    Parameters
    ----------
    index_name: str
        Name of the index whose statistics are requested.

    Returns
    -------
    IndexStatistics or None
        Statistics for the named index, or None if the index does not exist.
    """
    # The inner table hands back either None or a mapping of field values;
    # the mapping is expanded into the IndexStatistics constructor.
    raw = await self._inner.index_stats(index_name)
    return None if raw is None else IndexStatistics(**raw)
async def uses_v2_manifest_paths(self) -> bool:
"""
Check if the table is using the new v2 manifest paths.
@@ -2713,3 +2733,31 @@ class AsyncTable:
to check if the table is already using the new path style.
"""
await self._inner.migrate_manifest_paths_v2()
@dataclass
class IndexStatistics:
    """
    Summary statistics reported for one index.

    Attributes
    ----------
    num_indexed_rows: int
        How many rows this index covers.
    num_unindexed_rows: int
        How many rows this index does not cover.
    index_type: str
        The kind of index that was created.
    distance_type: Optional[str]
        The distance type the index uses; defaults to None.
    num_indices: Optional[int]
        How many parts the index is split into; defaults to None.
    """

    num_indexed_rows: int
    num_unindexed_rows: int
    index_type: Literal[
        "IVF_PQ",
        "IVF_HNSW_PQ",
        "IVF_HNSW_SQ",
        "FTS",
        "BTREE",
        "BITMAP",
        "LABEL_LIST",
    ]
    distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
    num_indices: Optional[int] = None

View File

@@ -66,6 +66,15 @@ async def test_create_bitmap_index(some_table: AsyncTable):
# TODO: Fix via https://github.com/lancedb/lance/issues/2039
# indices = await some_table.list_indices()
# assert str(indices) == '[Index(Bitmap, columns=["id"])]'
indices = await some_table.list_indices()
assert len(indices) == 1
index_name = indices[0].name
stats = await some_table.index_stats(index_name)
assert stats.index_type == "BITMAP"
assert stats.distance_type is None
assert stats.num_indexed_rows == await some_table.count_rows()
assert stats.num_unindexed_rows == 0
assert stats.num_indices == 1
@pytest.mark.asyncio
@@ -91,6 +100,14 @@ async def test_create_vector_index(some_table: AsyncTable):
assert len(indices) == 1
assert indices[0].index_type == "IvfPq"
assert indices[0].columns == ["vector"]
assert indices[0].name == "vector_idx"
stats = await some_table.index_stats("vector_idx")
assert stats.index_type == "IVF_PQ"
assert stats.distance_type == "l2"
assert stats.num_indexed_rows == await some_table.count_rows()
assert stats.num_unindexed_rows == 0
assert stats.num_indices == 1
@pytest.mark.asyncio

View File

@@ -200,6 +200,8 @@ pub struct IndexConfig {
/// Currently this is always a list of size 1. In the future there may
/// be more columns to represent composite indices.
pub columns: Vec<String>,
/// Name of the index.
pub name: String,
}
#[pymethods]
@@ -215,6 +217,7 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
Self {
index_type,
columns: value.columns,
name: value.name,
}
}
}

View File

@@ -8,8 +8,8 @@ use lancedb::table::{
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyclass, pymethods,
types::{PyDict, PyString},
Bound, PyAny, PyRef, PyResult, Python,
types::{PyDict, PyDictMethods, PyString},
Bound, PyAny, PyRef, PyResult, Python, ToPyObject,
};
use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -204,6 +204,33 @@ impl Table {
})
}
/// Fetch statistics for the index called `index_name`.
///
/// Returns the Python object produced by `future_into_py` for the async
/// lookup. The future yields `None` when the inner table returns no
/// statistics; otherwise it yields a dict with the keys `num_indexed_rows`,
/// `num_unindexed_rows` and `index_type`, plus `distance_type` and
/// `num_indices` only when those values are present.
pub fn index_stats(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
    let table = self_.inner_ref()?.clone();
    future_into_py(self_.py(), async move {
        // NOTE(review): `infer_error` presumably maps the lancedb error onto a
        // Python exception type — confirm against its definition.
        let maybe_stats = table.index_stats(&index_name).await.infer_error()?;
        match maybe_stats {
            // The GIL is only taken once the lookup has finished, to build the dict.
            Some(stats) => Python::with_gil(|py| {
                let fields = PyDict::new_bound(py);
                fields.set_item("num_indexed_rows", stats.num_indexed_rows)?;
                fields.set_item("num_unindexed_rows", stats.num_unindexed_rows)?;
                fields.set_item("index_type", stats.index_type.to_string())?;
                // Optional values are left out of the dict rather than stored as None.
                if let Some(distance_type) = stats.distance_type {
                    fields.set_item("distance_type", distance_type.to_string())?;
                }
                if let Some(num_indices) = stats.num_indices {
                    fields.set_item("num_indices", num_indices)?;
                }
                Ok(Some(fields.to_object(py)))
            }),
            None => Ok(None),
        }
    })
}
pub fn __repr__(&self) -> String {
match &self.inner {
None => format!("ClosedTable({})", self.name),