mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 19:32:56 +00:00
feat: remote index stats (#1702)
BREAKING CHANGE: the return value of the `index_stats` method has changed, and all `index_stats` APIs now take the index name instead of the index UUID. Several deprecated index statistics methods were also removed.
* Removes deprecated methods for individual index statistics.
* Aligns the public `IndexStatistics` struct with the API response from LanceDB Cloud.
* Implements `index_stats` for the remote Rust SDK and the Python async API.
This commit is contained in:
@@ -724,9 +724,9 @@ export interface VectorIndex {
|
||||
export interface IndexStats {
|
||||
numIndexedRows: number | null
|
||||
numUnindexedRows: number | null
|
||||
indexType: string | null
|
||||
distanceType: string | null
|
||||
completedAt: string | null
|
||||
indexType: string
|
||||
distanceType?: string
|
||||
numIndices?: number
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -112,7 +112,7 @@ export class Query<T = number[]> {
|
||||
return this
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Skip searching un-indexed data. This can make search faster, but will miss
|
||||
* any data that is not yet indexed.
|
||||
*/
|
||||
|
||||
@@ -17,7 +17,7 @@ import axios, { type AxiosResponse, type ResponseType } from 'axios'
|
||||
import { tableFromIPC, type Table as ArrowTable } from 'apache-arrow'
|
||||
|
||||
import { type RemoteResponse, type RemoteRequest, Method } from '../middleware'
|
||||
import { MetricType } from '..'
|
||||
import type { MetricType } from '..'
|
||||
|
||||
interface HttpLancedbClientMiddleware {
|
||||
onRemoteRequest(
|
||||
|
||||
@@ -526,8 +526,7 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
numIndexedRows: body?.num_indexed_rows,
|
||||
numUnindexedRows: body?.num_unindexed_rows,
|
||||
indexType: body?.index_type,
|
||||
distanceType: body?.distance_type,
|
||||
completedAt: body?.completed_at
|
||||
distanceType: body?.distance_type
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -888,9 +888,12 @@ describe("LanceDB client", function () {
|
||||
expect(indices[0].columns).to.have.lengthOf(1);
|
||||
expect(indices[0].columns[0]).to.equal("vector");
|
||||
|
||||
const stats = await table.indexStats(indices[0].uuid);
|
||||
const stats = await table.indexStats(indices[0].name);
|
||||
expect(stats.numIndexedRows).to.equal(300);
|
||||
expect(stats.numUnindexedRows).to.equal(0);
|
||||
expect(stats.indexType).to.equal("IVF_PQ");
|
||||
expect(stats.distanceType).to.equal("l2");
|
||||
expect(stats.numIndices).to.equal(1);
|
||||
}).timeout(50_000);
|
||||
});
|
||||
|
||||
|
||||
@@ -479,6 +479,9 @@ describe("When creating an index", () => {
|
||||
expect(stats).toBeDefined();
|
||||
expect(stats?.numIndexedRows).toEqual(300);
|
||||
expect(stats?.numUnindexedRows).toEqual(0);
|
||||
expect(stats?.distanceType).toBeUndefined();
|
||||
expect(stats?.indexType).toEqual("BTREE");
|
||||
expect(stats?.numIndices).toEqual(1);
|
||||
});
|
||||
|
||||
test("when getting stats on non-existent index", async () => {
|
||||
|
||||
@@ -32,7 +32,6 @@ export {
|
||||
ColumnAlteration,
|
||||
ConnectionOptions,
|
||||
IndexStatistics,
|
||||
IndexMetadata,
|
||||
IndexConfig,
|
||||
} from "./native.js";
|
||||
|
||||
|
||||
@@ -337,7 +337,7 @@ impl Table {
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
|
||||
let tbl = self.inner_ref()?.as_native().unwrap();
|
||||
let tbl = self.inner_ref()?;
|
||||
let stats = tbl.index_stats(&index_name).await.default_error()?;
|
||||
Ok(stats.map(IndexStatistics::from))
|
||||
}
|
||||
@@ -480,32 +480,22 @@ pub struct IndexStatistics {
|
||||
/// The number of rows not indexed
|
||||
pub num_unindexed_rows: f64,
|
||||
/// The type of the index
|
||||
pub index_type: Option<String>,
|
||||
/// The metadata for each index
|
||||
pub indices: Vec<IndexMetadata>,
|
||||
pub index_type: String,
|
||||
/// The type of the distance function used by the index. This is only
|
||||
/// present for vector indices. Scalar and full text search indices do
|
||||
/// not have a distance function.
|
||||
pub distance_type: Option<String>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
}
|
||||
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
fn from(value: lancedb::index::IndexStatistics) -> Self {
|
||||
Self {
|
||||
num_indexed_rows: value.num_indexed_rows as f64,
|
||||
num_unindexed_rows: value.num_unindexed_rows as f64,
|
||||
index_type: value.index_type.map(|t| format!("{:?}", t)),
|
||||
indices: value.indices.into_iter().map(Into::into).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct IndexMetadata {
|
||||
pub metric_type: Option<String>,
|
||||
pub index_type: Option<String>,
|
||||
}
|
||||
|
||||
impl From<lancedb::index::IndexMetadata> for IndexMetadata {
|
||||
fn from(value: lancedb::index::IndexMetadata) -> Self {
|
||||
Self {
|
||||
metric_type: value.metric_type,
|
||||
index_type: value.index_type,
|
||||
index_type: value.index_type.to_string(),
|
||||
distance_type: value.distance_type.map(|d| d.to_string()),
|
||||
num_indices: value.num_indices,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2683,6 +2683,26 @@ class AsyncTable:
|
||||
"""
|
||||
return await self._inner.list_indices()
|
||||
|
||||
async def index_stats(self, index_name: str) -> Optional[IndexStatistics]:
    """
    Retrieve statistics about an index

    Parameters
    ----------
    index_name: str
        The name of the index to retrieve statistics for

    Returns
    -------
    IndexStatistics or None
        The statistics about the index. Returns None if the index does not exist.
    """
    # The inner table resolves to either None or a mapping of field names to
    # values; the mapping is expanded into the typed dataclass.
    stats = await self._inner.index_stats(index_name)
    if stats is None:
        return None
    return IndexStatistics(**stats)
|
||||
|
||||
async def uses_v2_manifest_paths(self) -> bool:
|
||||
"""
|
||||
Check if the table is using the new v2 manifest paths.
|
||||
@@ -2713,3 +2733,31 @@ class AsyncTable:
|
||||
to check if the table is already using the new path style.
|
||||
"""
|
||||
await self._inner.migrate_manifest_paths_v2()
|
||||
|
||||
|
||||
@dataclass
class IndexStatistics:
    """
    Statistics about an index.

    Attributes
    ----------
    num_indexed_rows: int
        The number of rows that are covered by this index.
    num_unindexed_rows: int
        The number of rows that are not covered by this index.
    index_type: str
        The type of index that was created. One of "IVF_PQ", "IVF_HNSW_PQ",
        "IVF_HNSW_SQ", "FTS", "BTREE", "BITMAP" or "LABEL_LIST".
    distance_type: Optional[str]
        The distance type used by the index ("l2", "cosine" or "dot").
        Defaults to None.
    num_indices: Optional[int]
        The number of parts the index is split into. Defaults to None.
    """

    num_indexed_rows: int
    num_unindexed_rows: int
    index_type: Literal[
        "IVF_PQ", "IVF_HNSW_PQ", "IVF_HNSW_SQ", "FTS", "BTREE", "BITMAP", "LABEL_LIST"
    ]
    distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
    num_indices: Optional[int] = None
|
||||
|
||||
@@ -66,6 +66,15 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
# TODO: Fix via https://github.com/lancedb/lance/issues/2039
|
||||
# indices = await some_table.list_indices()
|
||||
# assert str(indices) == '[Index(Bitmap, columns=["id"])]'
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
index_name = indices[0].name
|
||||
stats = await some_table.index_stats(index_name)
|
||||
assert stats.index_type == "BITMAP"
|
||||
assert stats.distance_type is None
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -91,6 +100,14 @@ async def test_create_vector_index(some_table: AsyncTable):
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "IvfPq"
|
||||
assert indices[0].columns == ["vector"]
|
||||
assert indices[0].name == "vector_idx"
|
||||
|
||||
stats = await some_table.index_stats("vector_idx")
|
||||
assert stats.index_type == "IVF_PQ"
|
||||
assert stats.distance_type == "l2"
|
||||
assert stats.num_indexed_rows == await some_table.count_rows()
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -200,6 +200,8 @@ pub struct IndexConfig {
|
||||
/// Currently this is always a list of size 1. In the future there may
|
||||
/// be more columns to represent composite indices.
|
||||
pub columns: Vec<String>,
|
||||
/// Name of the index.
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
@@ -215,6 +217,7 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
|
||||
Self {
|
||||
index_type,
|
||||
columns: value.columns,
|
||||
name: value.name,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@ use lancedb::table::{
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyclass, pymethods,
|
||||
types::{PyDict, PyString},
|
||||
Bound, PyAny, PyRef, PyResult, Python,
|
||||
types::{PyDict, PyDictMethods, PyString},
|
||||
Bound, PyAny, PyRef, PyResult, Python, ToPyObject,
|
||||
};
|
||||
use pyo3_asyncio_0_21::tokio::future_into_py;
|
||||
|
||||
@@ -204,6 +204,33 @@ impl Table {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn index_stats(self_: PyRef<'_, Self>, index_name: String) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let stats = inner.index_stats(&index_name).await.infer_error()?;
|
||||
if let Some(stats) = stats {
|
||||
Python::with_gil(|py| {
|
||||
let dict = PyDict::new_bound(py);
|
||||
dict.set_item("num_indexed_rows", stats.num_indexed_rows)?;
|
||||
dict.set_item("num_unindexed_rows", stats.num_unindexed_rows)?;
|
||||
dict.set_item("index_type", stats.index_type.to_string())?;
|
||||
|
||||
if let Some(distance_type) = stats.distance_type {
|
||||
dict.set_item("distance_type", distance_type.to_string())?;
|
||||
}
|
||||
|
||||
if let Some(num_indices) = stats.num_indices {
|
||||
dict.set_item("num_indices", num_indices)?;
|
||||
}
|
||||
|
||||
Ok(Some(dict.to_object(py)))
|
||||
})
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn __repr__(&self) -> String {
|
||||
match &self.inner {
|
||||
None => format!("ClosedTable({})", self.name),
|
||||
|
||||
@@ -470,49 +470,42 @@ impl JsTable {
|
||||
Ok(promise)
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
|
||||
let rt = runtime(&mut cx)?;
|
||||
let (deferred, promise) = cx.promise();
|
||||
let index_uuid = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||
let index_name = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||
let channel = cx.channel();
|
||||
let table = js_table.table.clone();
|
||||
|
||||
rt.spawn(async move {
|
||||
let load_stats = futures::try_join!(
|
||||
table.as_native().unwrap().count_indexed_rows(&index_uuid),
|
||||
table.as_native().unwrap().count_unindexed_rows(&index_uuid)
|
||||
);
|
||||
let load_stats = table.index_stats(index_name).await;
|
||||
|
||||
deferred.settle_with(&channel, move |mut cx| {
|
||||
let (indexed_rows, unindexed_rows) = load_stats.or_throw(&mut cx)?;
|
||||
let stats = load_stats.or_throw(&mut cx)?;
|
||||
|
||||
let output = JsObject::new(&mut cx);
|
||||
if let Some(stats) = stats {
|
||||
let output = JsObject::new(&mut cx);
|
||||
let num_indexed_rows = cx.number(stats.num_indexed_rows as f64);
|
||||
output.set(&mut cx, "numIndexedRows", num_indexed_rows)?;
|
||||
let num_unindexed_rows = cx.number(stats.num_unindexed_rows as f64);
|
||||
output.set(&mut cx, "numUnindexedRows", num_unindexed_rows)?;
|
||||
if let Some(distance_type) = stats.distance_type {
|
||||
let distance_type = cx.string(distance_type.to_string());
|
||||
output.set(&mut cx, "distanceType", distance_type)?;
|
||||
}
|
||||
let index_type = cx.string(stats.index_type.to_string());
|
||||
output.set(&mut cx, "indexType", index_type)?;
|
||||
|
||||
match indexed_rows {
|
||||
Some(x) => {
|
||||
let i = cx.number(x as f64);
|
||||
output.set(&mut cx, "numIndexedRows", i)?;
|
||||
if let Some(num_indices) = stats.num_indices {
|
||||
let num_indices = cx.number(num_indices as f64);
|
||||
output.set(&mut cx, "numIndices", num_indices)?;
|
||||
}
|
||||
None => {
|
||||
let null = cx.null();
|
||||
output.set(&mut cx, "numIndexedRows", null)?;
|
||||
}
|
||||
};
|
||||
|
||||
match unindexed_rows {
|
||||
Some(x) => {
|
||||
let i = cx.number(x as f64);
|
||||
output.set(&mut cx, "numUnindexedRows", i)?;
|
||||
}
|
||||
None => {
|
||||
let null = cx.null();
|
||||
output.set(&mut cx, "numUnindexedRows", null)?;
|
||||
}
|
||||
};
|
||||
|
||||
Ok(output)
|
||||
Ok(output.as_value(&mut cx))
|
||||
} else {
|
||||
Ok(JsNull::new(&mut cx).as_value(&mut cx))
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use scalar::FtsIndexBuilder;
|
||||
use serde::Deserialize;
|
||||
use serde_with::skip_serializing_none;
|
||||
|
||||
use crate::{table::TableInternal, Result};
|
||||
use crate::{table::TableInternal, DistanceType, Result};
|
||||
|
||||
use self::{
|
||||
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder},
|
||||
@@ -102,19 +102,42 @@ impl IndexBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
#[derive(Debug, Clone, PartialEq, Deserialize)]
|
||||
pub enum IndexType {
|
||||
// Vector
|
||||
#[serde(alias = "IVF_PQ")]
|
||||
IvfPq,
|
||||
#[serde(alias = "IVF_HNSW_PQ")]
|
||||
IvfHnswPq,
|
||||
#[serde(alias = "IVF_HNSW_SQ")]
|
||||
IvfHnswSq,
|
||||
// Scalar
|
||||
#[serde(alias = "BTREE")]
|
||||
BTree,
|
||||
#[serde(alias = "BITMAP")]
|
||||
Bitmap,
|
||||
#[serde(alias = "LABEL_LIST")]
|
||||
LabelList,
|
||||
// FTS
|
||||
FTS,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for IndexType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::IvfPq => write!(f, "IVF_PQ"),
|
||||
Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
|
||||
Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
|
||||
Self::BTree => write!(f, "BTREE"),
|
||||
Self::Bitmap => write!(f, "BITMAP"),
|
||||
Self::LabelList => write!(f, "LABEL_LIST"),
|
||||
Self::FTS => write!(f, "FTS"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A description of an index currently configured on a column
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub struct IndexConfig {
|
||||
/// The name of the index
|
||||
pub name: String,
|
||||
@@ -129,16 +152,39 @@ pub struct IndexConfig {
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct IndexMetadata {
|
||||
pub metric_type: Option<String>,
|
||||
pub index_type: Option<String>,
|
||||
pub(crate) struct IndexMetadata {
|
||||
pub metric_type: Option<DistanceType>,
|
||||
// Sometimes the index type is provided at this level.
|
||||
pub index_type: Option<IndexType>,
|
||||
}
|
||||
|
||||
// This struct is used to deserialize the JSON data returned from the Lance API
|
||||
// Dataset::index_statistics().
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub(crate) struct IndexStatisticsImpl {
|
||||
pub num_indexed_rows: usize,
|
||||
pub num_unindexed_rows: usize,
|
||||
pub indices: Vec<IndexMetadata>,
|
||||
// Sometimes, the index type is provided at this level.
|
||||
pub index_type: Option<IndexType>,
|
||||
pub num_indices: Option<u32>,
|
||||
}
|
||||
|
||||
#[skip_serializing_none]
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Debug, Deserialize, PartialEq)]
|
||||
pub struct IndexStatistics {
|
||||
/// The number of rows in the table that are covered by this index.
|
||||
pub num_indexed_rows: usize,
|
||||
/// The number of rows in the table that are not covered by this index.
|
||||
/// These are rows that haven't yet been added to the index.
|
||||
pub num_unindexed_rows: usize,
|
||||
pub index_type: Option<String>,
|
||||
pub indices: Vec<IndexMetadata>,
|
||||
/// The type of the index.
|
||||
pub index_type: IndexType,
|
||||
/// The distance type used by the index.
|
||||
///
|
||||
/// This is only present for vector indices.
|
||||
pub distance_type: Option<DistanceType>,
|
||||
/// The number of parts this index is split into.
|
||||
pub num_indices: Option<u32>,
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::index::Index;
|
||||
use crate::index::IndexStatistics;
|
||||
use crate::query::Select;
|
||||
use crate::table::AddDataMode;
|
||||
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
|
||||
@@ -523,6 +524,26 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
message: "list_indices is not yet supported.".into(),
|
||||
})
|
||||
}
|
||||
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>> {
|
||||
let request = self
|
||||
.client
|
||||
.post(&format!("/table/{}/index/{}/stats/", self.name, index_name));
|
||||
let response = self.client.send(request).await?;
|
||||
|
||||
if response.status() == StatusCode::NOT_FOUND {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let response = self.check_table_response(response).await?;
|
||||
|
||||
let body = response.text().await?;
|
||||
|
||||
let stats = serde_json::from_str(&body).map_err(|e| Error::Http {
|
||||
message: format!("Failed to parse index statistics: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(Some(stats))
|
||||
}
|
||||
async fn table_definition(&self) -> Result<TableDefinition> {
|
||||
Err(Error::NotSupported {
|
||||
message: "table_definition is not supported on LanceDB cloud.".into(),
|
||||
@@ -582,7 +603,7 @@ mod tests {
|
||||
use reqwest::Body;
|
||||
|
||||
use crate::{
|
||||
index::{vector::IvfPqIndexBuilder, Index},
|
||||
index::{vector::IvfPqIndexBuilder, Index, IndexStatistics, IndexType},
|
||||
query::{ExecutableQuery, QueryBase},
|
||||
DistanceType, Error, Table,
|
||||
};
|
||||
@@ -1152,4 +1173,49 @@ mod tests {
|
||||
table.create_index(&["a"], index).execute().await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_index_stats() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/table/my_table/index/my_index/stats/"
|
||||
);
|
||||
|
||||
let response_body = serde_json::json!({
|
||||
"num_indexed_rows": 100000,
|
||||
"num_unindexed_rows": 0,
|
||||
"index_type": "IVF_PQ",
|
||||
"distance_type": "l2"
|
||||
});
|
||||
let response_body = serde_json::to_string(&response_body).unwrap();
|
||||
|
||||
http::Response::builder()
|
||||
.status(200)
|
||||
.body(response_body)
|
||||
.unwrap()
|
||||
});
|
||||
let indices = table.index_stats("my_index").await.unwrap().unwrap();
|
||||
let expected = IndexStatistics {
|
||||
num_indexed_rows: 100000,
|
||||
num_unindexed_rows: 0,
|
||||
index_type: IndexType::IvfPq,
|
||||
distance_type: Some(DistanceType::L2),
|
||||
num_indices: None,
|
||||
};
|
||||
assert_eq!(indices, expected);
|
||||
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(
|
||||
request.url().path(),
|
||||
"/table/my_table/index/my_index/stats/"
|
||||
);
|
||||
|
||||
http::Response::builder().status(404).body("").unwrap()
|
||||
});
|
||||
let indices = table.index_stats("my_index").await.unwrap();
|
||||
assert!(indices.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +47,6 @@ use lance_index::IndexType;
|
||||
use lance_table::io::commit::ManifestNamingScheme;
|
||||
use log::info;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::whatever;
|
||||
|
||||
use crate::arrow::IntoArrow;
|
||||
use crate::connection::NoData;
|
||||
@@ -58,12 +57,12 @@ use crate::index::vector::{
|
||||
suggested_num_partitions_for_hnsw, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
|
||||
IvfPqIndexBuilder, VectorIndex,
|
||||
};
|
||||
use crate::index::IndexConfig;
|
||||
use crate::index::IndexStatistics;
|
||||
use crate::index::{
|
||||
vector::{suggested_num_partitions, suggested_num_sub_vectors},
|
||||
Index, IndexBuilder,
|
||||
};
|
||||
use crate::index::{IndexConfig, IndexStatisticsImpl};
|
||||
use crate::query::{
|
||||
IntoQueryVector, Query, QueryExecutionOptions, Select, VectorQuery, DEFAULT_TOP_K,
|
||||
};
|
||||
@@ -405,6 +404,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
|
||||
async fn update(&self, update: UpdateBuilder) -> Result<u64>;
|
||||
async fn create_index(&self, index: IndexBuilder) -> Result<()>;
|
||||
async fn list_indices(&self) -> Result<Vec<IndexConfig>>;
|
||||
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>>;
|
||||
async fn merge_insert(
|
||||
&self,
|
||||
params: MergeInsertBuilder,
|
||||
@@ -962,6 +962,15 @@ impl Table {
|
||||
pub fn dataset_uri(&self) -> &str {
|
||||
self.inner.dataset_uri()
|
||||
}
|
||||
|
||||
/// Get statistics about an index.
|
||||
/// Returns None if the index does not exist.
|
||||
pub async fn index_stats(
|
||||
&self,
|
||||
index_name: impl AsRef<str>,
|
||||
) -> Result<Option<IndexStatistics>> {
|
||||
self.inner.index_stats(index_name.as_ref()).await
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NativeTable> for Table {
|
||||
@@ -1250,91 +1259,6 @@ impl NativeTable {
|
||||
.await)
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
|
||||
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
|
||||
#[allow(deprecated)]
|
||||
match self.load_index_stats(index_uuid).await? {
|
||||
Some(stats) => Ok(Some(stats.num_indexed_rows)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
|
||||
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
|
||||
#[allow(deprecated)]
|
||||
match self.load_index_stats(index_uuid).await? {
|
||||
Some(stats) => Ok(Some(stats.num_unindexed_rows)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
|
||||
pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
|
||||
#[allow(deprecated)]
|
||||
match self.load_index_stats(index_uuid).await? {
|
||||
Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
|
||||
pub async fn get_distance_type(&self, index_uuid: &str) -> Result<Option<String>> {
|
||||
#[allow(deprecated)]
|
||||
match self.load_index_stats(index_uuid).await? {
|
||||
Some(stats) => Ok(Some(
|
||||
stats
|
||||
.indices
|
||||
.iter()
|
||||
.filter_map(|i| i.metric_type.clone())
|
||||
.collect(),
|
||||
)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.5.2", note = "Please use `index_stats` instead")]
|
||||
pub async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
|
||||
let index = self
|
||||
.load_indices()
|
||||
.await?
|
||||
.into_iter()
|
||||
.find(|i| i.index_uuid == index_uuid);
|
||||
if index.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
let dataset = self.dataset.get().await?;
|
||||
let index_stats = dataset.index_statistics(&index.unwrap().index_name).await?;
|
||||
let index_stats: IndexStatistics = whatever!(
|
||||
serde_json::from_str(&index_stats),
|
||||
"error deserializing index statistics {index_stats}",
|
||||
);
|
||||
|
||||
Ok(Some(index_stats))
|
||||
}
|
||||
|
||||
/// Get statistics about an index.
|
||||
/// Returns an error if the index does not exist.
|
||||
pub async fn index_stats(
|
||||
&self,
|
||||
index_name: impl AsRef<str>,
|
||||
) -> Result<Option<IndexStatistics>> {
|
||||
let stats = match self
|
||||
.dataset
|
||||
.get()
|
||||
.await?
|
||||
.index_statistics(index_name.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok(stats) => stats,
|
||||
Err(lance::error::Error::IndexNotFound { .. }) => return Ok(None),
|
||||
Err(e) => return Err(Error::from(e)),
|
||||
};
|
||||
|
||||
serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
|
||||
message: format!("error deserializing index statistics: {}", e),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
|
||||
let dataset = self.dataset.get().await?;
|
||||
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
||||
@@ -2126,6 +2050,44 @@ impl TableInternal for NativeTable {
|
||||
fn dataset_uri(&self) -> &str {
|
||||
self.uri.as_str()
|
||||
}
|
||||
|
||||
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>> {
|
||||
let stats = match self
|
||||
.dataset
|
||||
.get()
|
||||
.await?
|
||||
.index_statistics(index_name.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok(stats) => stats,
|
||||
Err(lance::error::Error::IndexNotFound { .. }) => return Ok(None),
|
||||
Err(e) => return Err(Error::from(e)),
|
||||
};
|
||||
|
||||
let mut stats: IndexStatisticsImpl =
|
||||
serde_json::from_str(&stats).map_err(|e| Error::InvalidInput {
|
||||
message: format!("error deserializing index statistics: {}", e),
|
||||
})?;
|
||||
|
||||
let first_index = stats.indices.pop().ok_or_else(|| Error::InvalidInput {
|
||||
message: "index statistics is empty".to_string(),
|
||||
})?;
|
||||
// Index type should be present at one of the levels.
|
||||
let index_type =
|
||||
stats
|
||||
.index_type
|
||||
.or(first_index.index_type)
|
||||
.ok_or_else(|| Error::InvalidInput {
|
||||
message: "index statistics was missing index type".to_string(),
|
||||
})?;
|
||||
Ok(Some(IndexStatistics {
|
||||
num_indexed_rows: stats.num_indexed_rows,
|
||||
num_unindexed_rows: stats.num_unindexed_rows,
|
||||
index_type,
|
||||
distance_type: first_index.metric_type,
|
||||
num_indices: stats.num_indices,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -2763,24 +2725,7 @@ mod tests {
|
||||
|
||||
let table = conn.create_table("test", batches).execute().await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
assert_eq!(table.index_stats("my_index").await.unwrap(), None);
|
||||
|
||||
table
|
||||
.create_index(&["embeddings"], Index::Auto)
|
||||
@@ -2797,43 +2742,12 @@ mod tests {
|
||||
assert_eq!(table.name(), "test");
|
||||
|
||||
let indices = table.as_native().unwrap().load_indices().await.unwrap();
|
||||
let index_uuid = &indices[0].index_uuid;
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(512)
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(0)
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.get_index_type(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some("IVF_PQ".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.get_distance_type(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(crate::DistanceType::L2.to_string())
|
||||
);
|
||||
let index_name = &indices[0].index_name;
|
||||
let stats = table.index_stats(index_name).await.unwrap().unwrap();
|
||||
assert_eq!(stats.num_indexed_rows, 512);
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
assert_eq!(stats.index_type, crate::index::IndexType::IvfPq);
|
||||
assert_eq!(stats.distance_type, Some(crate::DistanceType::L2));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -2876,24 +2790,8 @@ mod tests {
|
||||
|
||||
let table = conn.create_table("test", batches).execute().await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
let stats = table.index_stats("my_index").await.unwrap();
|
||||
assert!(stats.is_none());
|
||||
|
||||
let index = IvfHnswSqIndexBuilder::default();
|
||||
table
|
||||
@@ -2911,25 +2809,10 @@ mod tests {
|
||||
assert_eq!(table.name(), "test");
|
||||
|
||||
let indices = table.as_native().unwrap().load_indices().await.unwrap();
|
||||
let index_uuid = &indices[0].index_uuid;
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(512)
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(0)
|
||||
);
|
||||
let index_name = &indices[0].index_name;
|
||||
let stats = table.index_stats(index_name).await.unwrap().unwrap();
|
||||
assert_eq!(stats.num_indexed_rows, 512);
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -2971,25 +2854,8 @@ mod tests {
|
||||
);
|
||||
|
||||
let table = conn.create_table("test", batches).execute().await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows("my_index")
|
||||
.await
|
||||
.unwrap(),
|
||||
None
|
||||
);
|
||||
let stats = table.index_stats("my_index").await.unwrap();
|
||||
assert!(stats.is_none());
|
||||
|
||||
let index = IvfHnswPqIndexBuilder::default();
|
||||
table
|
||||
@@ -3006,26 +2872,11 @@ mod tests {
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 512);
|
||||
assert_eq!(table.name(), "test");
|
||||
|
||||
let indices = table.as_native().unwrap().load_indices().await.unwrap();
|
||||
let index_uuid = &indices[0].index_uuid;
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(512)
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(0)
|
||||
);
|
||||
let indices: Vec<VectorIndex> = table.as_native().unwrap().load_indices().await.unwrap();
|
||||
let index_name = &indices[0].index_name;
|
||||
let stats = table.index_stats(index_name).await.unwrap().unwrap();
|
||||
assert_eq!(stats.num_indexed_rows, 512);
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
|
||||
@@ -3101,25 +2952,10 @@ mod tests {
|
||||
assert_eq!(index.columns, vec!["i".to_string()]);
|
||||
|
||||
let indices = table.as_native().unwrap().load_indices().await.unwrap();
|
||||
let index_uuid = &indices[0].index_uuid;
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_indexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(1)
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.count_unindexed_rows(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some(0)
|
||||
);
|
||||
let index_name = &indices[0].index_name;
|
||||
let stats = table.index_stats(index_name).await.unwrap().unwrap();
|
||||
assert_eq!(stats.num_indexed_rows, 1);
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
Reference in New Issue
Block a user