diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index 80f9f2673..0c71fc87b 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -105,6 +105,46 @@ async def test_create_scalar_index(some_table: AsyncTable): assert len(indices) == 0 +@pytest.mark.asyncio +async def test_create_nested_scalar_index_lists_canonical_paths(db_async): + metadata_type = pa.struct( + [ + pa.field("user_id", pa.int32()), + pa.field("user.id", pa.int32()), + ] + ) + data = pa.Table.from_arrays( + [ + pa.array([1, 2, 3], type=pa.int32()), + pa.array( + [ + {"user_id": 10, "user.id": 100}, + {"user_id": 20, "user.id": 200}, + {"user_id": 30, "user.id": 300}, + ], + type=metadata_type, + ), + ], + names=["user_id", "metadata"], + ) + table = await db_async.create_table("nested_scalar_index", data) + + await table.create_index("user_id", config=BTree(), name="top_user_id_idx") + await table.create_index( + "metadata.user_id", config=BTree(), name="nested_user_id_idx" + ) + await table.create_index( + "metadata.`user.id`", config=BTree(), name="escaped_user_id_idx" + ) + + columns_by_name = { + index.name: index.columns for index in await table.list_indices() + } + assert columns_by_name["top_user_id_idx"] == ["user_id"] + assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"] + assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"] + + @pytest.mark.asyncio async def test_create_fixed_size_binary_index(some_table: AsyncTable): await some_table.create_index("fsb", config=BTree()) diff --git a/rust/lancedb/src/index/vector.rs b/rust/lancedb/src/index/vector.rs index 7e62c9f6c..29e01a49b 100644 --- a/rust/lancedb/src/index/vector.rs +++ b/rust/lancedb/src/index/vector.rs @@ -23,17 +23,12 @@ impl VectorIndex { .fields .iter() .map(|field_id| { - manifest - .schema - .field_by_id(*field_id) - .unwrap_or_else(|| { - panic!( - "field {field_id} of index {} must exist in schema", - index.name - ) - }) - .name - .clone() + manifest.schema.field_path(*field_id).unwrap_or_else(|_| { + panic!( + "field {field_id} of index {} must exist in schema", + index.name + ) + }) }) .collect(); Self { diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 2d8f626cc..1bad181d9 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -3505,7 +3505,7 @@ mod tests { { "index_name": "my_idx", "index_uuid": "34255f64-5717-4562-b3fc-2c963f66afa6", - "columns": ["my_column"], + "columns": ["metadata.`my.column`"], "index_status": "done", }, ] @@ -3544,7 +3544,7 @@ mod tests { IndexConfig { name: "my_idx".into(), index_type: IndexType::LabelList, - columns: vec!["my_column".into()], + columns: vec!["metadata.`my.column`".into()], }, ]; assert_eq!(indices, expected); diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index fa8edd2af..7a2417822 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2688,16 +2688,13 @@ impl BaseTable for NativeTable { message: "Multi-column (composite) indices are not yet supported".to_string(), }); } - - let dataset = self.dataset.get().await?; + self.dataset.ensure_mutable()?; + let mut dataset = (*self.dataset.get().await?).clone(); let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?; - drop(dataset); let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?; let index_type = self.get_index_type_for_field(&field, &opts.index); let columns = [column.as_str()]; - self.dataset.ensure_mutable()?; - let mut dataset = (*self.dataset.get().await?).clone(); let mut builder = dataset .create_index_builder(&columns, index_type, lance_idx_params.as_ref()) .train(opts.train) @@ -2815,63 +2812,88 @@ impl BaseTable for NativeTable { async fn list_indices(&self) -> Result> { let dataset = self.dataset.get().await?; let indices = dataset.load_indices().await?; - let results = futures::stream::iter(indices.as_slice()).then(|idx| async { - - // skip Lance internal indexes - if idx.name == FRAG_REUSE_INDEX_NAME { - return None; - } - - let stats = match dataset.index_statistics(idx.name.as_str()).await { - Ok(stats) => stats, - Err(e) => { - log::warn!("Failed to get statistics for index {} ({}): {}", idx.name, idx.uuid, e); + let results = futures::stream::iter(indices.as_slice()) + .then(|idx| async { + // skip Lance internal indexes + if idx.name == FRAG_REUSE_INDEX_NAME { return None; } - }; - let stats: serde_json::Value = match serde_json::from_str(&stats) { - Ok(stats) => stats, - Err(e) => { - log::warn!("Failed to deserialize index statistics for index {} ({}): {}", idx.name, idx.uuid, e); - return None; - } - }; - - let Some(index_type) = stats.get("index_type").and_then(|v| v.as_str()) else { - log::warn!("Index statistics was missing 'index_type' field for index {} ({})", idx.name, idx.uuid); - return None; - }; - - let index_type: crate::index::IndexType = match index_type.parse() { - Ok(index_type) => index_type, - Err(e) => { - log::warn!("Failed to parse index type for index {} ({}): {}", idx.name, idx.uuid, e); - return None; - } - }; - - let mut columns = Vec::with_capacity(idx.fields.len()); - for field_id in &idx.fields { - let column = match dataset.schema().field_path(*field_id) { - Ok(column) => column, + let stats = match dataset.index_statistics(idx.name.as_str()).await { + Ok(stats) => stats, Err(e) => { log::warn!( - "The index {} ({}) referenced a field with id {} which does not exist in the schema: {}", + "Failed to get statistics for index {} ({}): {}", idx.name, idx.uuid, - field_id, e ); return None; } }; - columns.push(column); - } - let name = idx.name.clone(); - Some(IndexConfig { index_type, columns, name }) - }).collect::>().await; + let stats: serde_json::Value = match serde_json::from_str(&stats) { + Ok(stats) => stats, + Err(e) => { + log::warn!( + "Failed to deserialize index statistics for index {} ({}): {}", + idx.name, + idx.uuid, + e + ); + return None; + } + }; + + let Some(index_type) = stats.get("index_type").and_then(|v| v.as_str()) else { + log::warn!( + "Index statistics was missing 'index_type' field for index {} ({})", + idx.name, + idx.uuid + ); + return None; + }; + + let index_type: crate::index::IndexType = match index_type.parse() { + Ok(index_type) => index_type, + Err(e) => { + log::warn!( + "Failed to parse index type for index {} ({}): {}", + idx.name, + idx.uuid, + e + ); + return None; + } + }; + + let mut columns = Vec::with_capacity(idx.fields.len()); + for field_id in &idx.fields { + let field_path = match dataset.schema().field_path(*field_id) { + Ok(field_path) => field_path, + Err(e) => { + log::warn!( + "Failed to resolve field path for index {} ({}) field id {}: {}", + idx.name, + idx.uuid, + field_id, + e + ); + return None; + } + }; + columns.push(field_path); + } + + let name = idx.name.clone(); + Some(IndexConfig { + index_type, + columns, + name, + }) + }) + .collect::>() + .await; Ok(results.into_iter().flatten().collect()) } @@ -3074,6 +3096,7 @@ pub struct FragmentSummaryStats { #[cfg(test)] #[allow(deprecated)] mod tests { + use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration;