diff --git a/Cargo.lock b/Cargo.lock index 8bd1bd375..2efd4eb98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5413,6 +5413,7 @@ dependencies = [ "async-trait", "aws-lc-rs", "aws-lc-sys", + "chrono", "env_logger", "futures", "half", @@ -5423,6 +5424,7 @@ dependencies = [ "napi", "napi-build", "napi-derive", + "serde_json", ] [[package]] @@ -5432,6 +5434,7 @@ dependencies = [ "arrow", "async-trait", "bytes", + "chrono", "datafusion-common", "env_logger", "futures", @@ -5974,12 +5977,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad513ff22558f1830b595ea6eb4091da48145d09a222ce157e781896f78be0b9" dependencies = [ "bitflags 2.11.1", + "chrono", "ctor 1.0.5", "futures", "napi-build", "napi-sys", "nohash-hasher", "rustc-hash", + "serde", + "serde_json", "tokio", ] @@ -7509,6 +7515,7 @@ version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91fd8e38a3b50ed1167fb981cd6fd60147e091784c427b8f7183a7ee32c31c12" dependencies = [ + "chrono", "libc", "once_cell", "portable-atomic", diff --git a/docs/src/js/interfaces/IndexConfig.md b/docs/src/js/interfaces/IndexConfig.md index 84fba7bf4..6622c0189 100644 --- a/docs/src/js/interfaces/IndexConfig.md +++ b/docs/src/js/interfaces/IndexConfig.md @@ -23,6 +23,31 @@ be more columns to represent composite indices. *** +### createdAt? + +```ts +optional createdAt: Date; +``` + +When the index was created. + +`undefined` for remote tables or indices created before timestamps were tracked. + +*** + +### indexDetails? + +```ts +optional indexDetails: any; +``` + +Index-type-specific details parsed as a JavaScript object. + +Falls back to a raw string if JSON parsing fails. `undefined` for +remote tables or when details are unavailable. + +*** + ### indexType ```ts @@ -33,6 +58,30 @@ The type of the index *** +### indexUuid? + +```ts +optional indexUuid: string; +``` + +The UUID of the first segment of the index. 
+ +`undefined` for remote tables, which do not yet surface this. + +*** + +### indexVersion? + +```ts +optional indexVersion: number; +``` + +The on-disk index format version. + +`undefined` for remote tables. + +*** + ### name ```ts @@ -40,3 +89,63 @@ name: string; ``` The name of the index + +*** + +### numIndexedRows? + +```ts +optional numIndexedRows: number; +``` + +The number of rows indexed, across all segments. + +`undefined` for remote tables. + +*** + +### numSegments? + +```ts +optional numSegments: number; +``` + +The number of segments that make up the index. + +`undefined` for remote tables. + +*** + +### numUnindexedRows? + +```ts +optional numUnindexedRows: number; +``` + +The number of rows not yet covered by this index. + +`undefined` for remote tables. + +*** + +### sizeBytes? + +```ts +optional sizeBytes: number; +``` + +The total size in bytes of all index files across all segments. + +`undefined` for remote tables or indices without size tracking. + +*** + +### typeUrl? + +```ts +optional typeUrl: string; +``` + +The protobuf type URL, a precise type identifier for the index. + +`undefined` for remote tables. 
diff --git a/nodejs/Cargo.toml b/nodejs/Cargo.toml index c46c5a4cd..2419ee00e 100644 --- a/nodejs/Cargo.toml +++ b/nodejs/Cargo.toml @@ -25,8 +25,12 @@ lancedb = { path = "../rust/lancedb", default-features = false } lance-namespace.workspace = true napi = { version = "3.8.3", default-features = false, features = [ "napi9", - "async" + "async", + "chrono_date", + "serde-json", ] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } +serde_json = "1" napi-derive = "3.5.2" # Prevent dynamic linking of lzma, which comes from datafusion lzma-sys = { version = "0.1", features = ["static"] } diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 12406c402..c808f1679 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -845,11 +845,13 @@ describe("When creating an index", () => { expect(fs.readdirSync(indexDir)).toHaveLength(1); const indices = await tbl.listIndices(); expect(indices.length).toBe(1); - expect(indices[0]).toEqual({ - name: "vec_idx", - indexType: "IvfPq", - columns: ["vec"], - }); + expect(indices[0]).toEqual( + expect.objectContaining({ + name: "vec_idx", + indexType: "IvfPq", + columns: ["vec"], + }), + ); const stats = await tbl.indexStats("vec_idx"); expect(stats).toBeDefined(); @@ -1011,51 +1013,51 @@ describe("When creating an index", () => { const indices = await nestedTable.listIndices(); expect(indices).toEqual( expect.arrayContaining([ - { + expect.objectContaining({ name: "row_id_idx", indexType: "BTree", columns: ["rowId"], - }, - { + }), + expect.objectContaining({ name: "row_dash_id_idx", indexType: "BTree", columns: ["`row-id`"], - }, - { + }), + expect.objectContaining({ name: "top_user_id_idx", indexType: "BTree", columns: ["userId"], - }, - { + }), + expect.objectContaining({ name: "nested_user_id_idx", indexType: "BTree", columns: ["metadata.user_id"], - }, - { + }), + expect.objectContaining({ name: "mixed_case_metadata_user_id_idx", indexType: "BTree", 
columns: ["MetaData.userId"], - }, - { + }), + expect.objectContaining({ name: "escaped_names_idx", indexType: "BTree", columns: ["`meta-data`.`user-id`"], - }, - { + }), + expect.objectContaining({ name: "literal_dot_idx", indexType: "BTree", columns: ["literal.`a.b`"], - }, - { + }), + expect.objectContaining({ name: "image_embedding_idx", indexType: "IvfPq", columns: ["image.embedding"], - }, - { + }), + expect.objectContaining({ name: "payload_text_idx", indexType: "FTS", columns: ["payload.text"], - }, + }), ]), ); @@ -1109,16 +1111,16 @@ describe("When creating an index", () => { const indicesAfterOptimize = await nestedTable.listIndices(); expect(indicesAfterOptimize).toEqual( expect.arrayContaining([ - { + expect.objectContaining({ name: "mixed_case_metadata_user_id_idx", indexType: "BTree", columns: ["MetaData.userId"], - }, - { + }), + expect.objectContaining({ name: "image_embedding_idx", indexType: "IvfPq", columns: ["image.embedding"], - }, + }), ]), ); }); @@ -1254,11 +1256,13 @@ describe("When creating an index", () => { expect(fs.readdirSync(indexDir)).toHaveLength(1); const indices = await tbl.listIndices(); expect(indices.length).toBe(1); - expect(indices[0]).toEqual({ - name: "vec_idx", - indexType: "IvfHnswSq", - columns: ["vec"], - }); + expect(indices[0]).toEqual( + expect.objectContaining({ + name: "vec_idx", + indexType: "IvfHnswSq", + columns: ["vec"], + }), + ); // Search without specifying the column let rst = await tbl @@ -1604,6 +1608,35 @@ describe("When creating an index", () => { expect(rst64Query.toString()).toEqual(rst64Search.toString()); expect(rst64Query.numRows).toBe(2); }); + + it("should expose rich metadata fields on IndexConfig", async () => { + await tbl.createIndex("id", { config: Index.btree() }); + await tbl.createIndex("vec"); + + const indicesByName = Object.fromEntries( + (await tbl.listIndices()).map((idx) => [idx.name, idx]), + ); + + const scalarIdx = indicesByName["id_idx"]; + expect(scalarIdx).toBeDefined(); + 
expect(typeof scalarIdx.indexUuid).toBe("string"); + expect(scalarIdx.numIndexedRows).toBe(300); + expect(scalarIdx.numUnindexedRows).toBe(0); + expect(scalarIdx.numSegments).toBeGreaterThanOrEqual(1); + expect(scalarIdx.sizeBytes).toBeGreaterThan(0); + // Use toString check to avoid cross-realm instanceof failures with native Date objects + expect(Object.prototype.toString.call(scalarIdx.createdAt)).toBe( + "[object Date]", + ); + expect((scalarIdx.createdAt as Date).getTime()).toBeGreaterThan(0); + expect(typeof scalarIdx.indexDetails).toBe("object"); + + const vectorIdx = indicesByName["vec_idx"]; + expect(vectorIdx).toBeDefined(); + expect(typeof vectorIdx.indexUuid).toBe("string"); + expect(vectorIdx.numIndexedRows).toBe(300); + expect(typeof vectorIdx.indexDetails).toBe("object"); + }); }); describe("When querying a table", () => { diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index 56bb3b676..b53be573a 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -3,6 +3,8 @@ use std::collections::HashMap; +use chrono::{DateTime, Utc}; + use lancedb::ipc::{ipc_file_to_batches, ipc_file_to_schema}; use lancedb::table::{ AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, @@ -602,6 +604,43 @@ pub struct IndexConfig { /// Currently this is always an array of size 1. In the future there may /// be more columns to represent composite indices. pub columns: Vec, + /// The UUID of the first segment of the index. + /// + /// `undefined` for remote tables, which do not yet surface this. + pub index_uuid: Option, + /// The protobuf type URL, a precise type identifier for the index. + /// + /// `undefined` for remote tables. + pub type_url: Option, + /// When the index was created. + /// + /// `undefined` for remote tables or indices created before timestamps were tracked. + pub created_at: Option>, + /// The number of rows indexed, across all segments. + /// + /// `undefined` for remote tables. 
+ pub num_indexed_rows: Option, + /// The number of rows not yet covered by this index. + /// + /// `undefined` for remote tables. + pub num_unindexed_rows: Option, + /// The total size in bytes of all index files across all segments. + /// + /// `undefined` for remote tables or indices without size tracking. + pub size_bytes: Option, + /// The number of segments that make up the index. + /// + /// `undefined` for remote tables. + pub num_segments: Option, + /// The on-disk index format version. + /// + /// `undefined` for remote tables. + pub index_version: Option, + /// Index-type-specific details parsed as a JavaScript object. + /// + /// Falls back to a raw string if JSON parsing fails. `undefined` for + /// remote tables or when details are unavailable. + pub index_details: Option, } impl From for IndexConfig { @@ -611,6 +650,18 @@ impl From for IndexConfig { index_type, columns: value.columns, name: value.name, + index_uuid: value.index_uuid, + type_url: value.type_url, + created_at: value.created_at, + num_indexed_rows: value.num_indexed_rows.map(|n| n as i64), + num_unindexed_rows: value.num_unindexed_rows.map(|n| n as i64), + size_bytes: value.size_bytes.map(|n| n as i64), + num_segments: value.num_segments.map(|n| n as i32), + index_version: value.index_version, + // Unparseable details are surfaced as a raw string (per the field docs), not dropped. + index_details: value.index_details.map(|s| { + serde_json::from_str(&s).unwrap_or(serde_json::Value::String(s)) + }), } } } diff --git a/python/Cargo.toml b/python/Cargo.toml index 3ea4cf31f..8457c3c1f 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -26,7 +26,8 @@ lance-namespace-impls.workspace = true lance-io.workspace = true env_logger.workspace = true log.workspace = true -pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] } +pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39", "chrono"] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } pyo3-async-runtimes = { version = "0.28", features = [ "attributes", "tokio-runtime", diff --git 
a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 739baaee6..8ddb28604 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple, Any, TypedDict, Union, Literal import pyarrow as pa @@ -259,6 +259,15 @@ class IndexConfig: name: str index_type: str columns: List[str] + index_uuid: Optional[str] + type_url: Optional[str] + created_at: Optional[datetime] + num_indexed_rows: Optional[int] + num_unindexed_rows: Optional[int] + size_bytes: Optional[int] + num_segments: Optional[int] + index_version: Optional[int] + index_details: Optional[Any] async def connect( uri: str, diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 6b9d723a7..b30877ade 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -2566,6 +2566,55 @@ def test_create_index_nested_field_paths(mem_db: DBConnection): assert fts_results[0]["payload"]["text"] == "document 44" +def test_index_config_fields(mem_db: DBConnection): + """Test that IndexConfig exposes the new rich metadata fields.""" + vec_array = pa.array( + [[float(i), float(i + 1)] for i in range(300)], pa.list_(pa.float32(), 2) + ) + data = pa.Table.from_pydict({"x": list(range(300)), "vector": vec_array}) + table = mem_db.create_table("index_config_fields", data=data) + table.create_scalar_index("x", index_type="BTREE") + table.create_index( + vector_column_name="vector", + num_partitions=1, + num_sub_vectors=1, + ) + + indices = {idx.name: idx for idx in table.list_indices()} + + scalar_idx = indices["x_idx"] + assert scalar_idx.index_uuid is not None + assert isinstance(scalar_idx.index_uuid, str) + assert scalar_idx.num_indexed_rows is not None + assert scalar_idx.num_indexed_rows == 300 + assert scalar_idx.num_unindexed_rows is not None + assert scalar_idx.num_unindexed_rows 
== 0 + assert scalar_idx.num_segments is not None + assert scalar_idx.num_segments >= 1 + assert scalar_idx.size_bytes is not None + assert scalar_idx.size_bytes > 0 + assert scalar_idx.created_at is not None + from datetime import datetime, timezone + + assert isinstance(scalar_idx.created_at, datetime) + assert scalar_idx.created_at.tzinfo == timezone.utc + + # __getitem__ compatibility + assert scalar_idx["index_uuid"] == scalar_idx.index_uuid + assert scalar_idx["num_indexed_rows"] == scalar_idx.num_indexed_rows + assert scalar_idx["created_at"] == scalar_idx.created_at + + # index_details is parsed from JSON into a Python object + assert scalar_idx.index_details is not None + assert isinstance(scalar_idx.index_details, dict) + assert scalar_idx["index_details"] == scalar_idx.index_details + + vector_idx = indices["vector_idx"] + assert vector_idx.index_uuid is not None + assert vector_idx.num_indexed_rows == 300 + assert isinstance(vector_idx.index_details, dict) + + def test_empty_query(mem_db: DBConnection): table = mem_db.create_table( "my_table", diff --git a/python/src/index.rs b/python/src/index.rs index 175f37093..50407dd21 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The LanceDB Authors +use chrono::{DateTime, Utc}; use lancedb::index::vector::{ IvfFlatIndexBuilder, IvfHnswFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, IvfRqIndexBuilder, IvfSqIndexBuilder, @@ -12,7 +13,7 @@ use lancedb::index::{ use pyo3::IntoPyObject; use pyo3::types::PyStringMethods; use pyo3::{ - Bound, FromPyObject, PyAny, PyResult, Python, + Bound, FromPyObject, Py, PyAny, PyResult, Python, exceptions::{PyKeyError, PyValueError}, intern, pyclass, pymethods, types::{PyAnyMethods, PyString}, @@ -294,6 +295,26 @@ pub struct IndexConfig { pub columns: Vec, /// Name of the index. pub name: String, + /// The UUID of the first segment of the index. 
+ pub index_uuid: Option, + /// The protobuf type URL, a precise type identifier for the index. + pub type_url: Option, + /// When the index was created. + pub created_at: Option>, + /// The number of rows indexed, across all segments. + pub num_indexed_rows: Option, + /// The number of rows not yet covered by this index. + pub num_unindexed_rows: Option, + /// The total size in bytes of all index files across all segments. + pub size_bytes: Option, + /// The number of segments that make up the index. + pub num_segments: Option, + /// The on-disk index format version. + pub index_version: Option, + /// Index-type-specific details parsed as a Python object (dict, list, etc.). + /// + /// Falls back to a raw string if JSON parsing fails. `None` when unavailable. + pub index_details: Option>, } #[pymethods] @@ -312,18 +333,49 @@ impl IndexConfig { "index_type" => Ok(self.index_type.clone().into_pyobject(py)?.into_any()), "columns" => Ok(self.columns.clone().into_pyobject(py)?.into_any()), "name" | "index_name" => Ok(self.name.clone().into_pyobject(py)?.into_any()), + "index_uuid" => Ok(self.index_uuid.clone().into_pyobject(py)?.into_any()), + "type_url" => Ok(self.type_url.clone().into_pyobject(py)?.into_any()), + "created_at" => Ok(self.created_at.into_pyobject(py)?.into_any()), + "num_indexed_rows" => Ok(self.num_indexed_rows.into_pyobject(py)?.into_any()), + "num_unindexed_rows" => Ok(self.num_unindexed_rows.into_pyobject(py)?.into_any()), + "size_bytes" => Ok(self.size_bytes.into_pyobject(py)?.into_any()), + "num_segments" => Ok(self.num_segments.into_pyobject(py)?.into_any()), + "index_version" => Ok(self.index_version.into_pyobject(py)?.into_any()), + "index_details" => Ok(self + .index_details + .as_ref() + .map(|obj| obj.clone_ref(py)) + .into_pyobject(py)? 
+ .into_any()), _ => Err(PyKeyError::new_err(format!("Invalid key: {}", key))), } } } -impl From for IndexConfig { - fn from(value: lancedb::index::IndexConfig) -> Self { +fn parse_index_details(py: Python<'_>, s: String) -> Py { + let json = py.import("json").expect("json module is always available"); + match json.call_method1("loads", (s.as_str(),)) { + Ok(obj) => obj.into_any().unbind(), + Err(_) => s.into_pyobject(py).unwrap().into_any().unbind(), + } +} + +impl IndexConfig { + pub fn from_lancedb(py: Python<'_>, value: lancedb::index::IndexConfig) -> Self { let index_type = format!("{:?}", value.index_type); Self { index_type, columns: value.columns, name: value.name, + index_uuid: value.index_uuid, + type_url: value.type_url, + created_at: value.created_at, + num_indexed_rows: value.num_indexed_rows, + num_unindexed_rows: value.num_unindexed_rows, + size_bytes: value.size_bytes, + num_segments: value.num_segments, + index_version: value.index_version, + index_details: value.index_details.map(|s| parse_index_details(py, s)), } } } diff --git a/python/src/table.rs b/python/src/table.rs index fd3857249..611f3c9f2 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -694,13 +694,13 @@ impl Table { pub fn list_indices(self_: PyRef<'_, Self>) -> PyResult> { let inner = self_.inner_ref()?.clone(); future_into_py(self_.py(), async move { - Ok(inner - .list_indices() - .await - .infer_error()? - .into_iter() - .map(IndexConfig::from) - .collect::>()) + let indices = inner.list_indices().await.infer_error()?; + Python::attach(|py| { + Ok(indices + .into_iter() + .map(|idx| IndexConfig::from_lancedb(py, idx)) + .collect::>()) + }) }) }