fix: return canonical nested index paths (#3413)

Index metadata APIs now resolve stored field ids back to Lance canonical
field paths instead of leaf names, so nested indexes such as
`metadata.user_id` and escaped literal-dot fields round-trip through
`list_indices()`. Native index creation also canonicalizes the input
path before handing it to Lance, keeping local metadata consistent with
the field-path contract while remote responses continue to expose
server-provided canonical columns.

Fixes #3403.
This commit is contained in:
Xuanwo
2026-05-21 00:20:47 +08:00
committed by GitHub
parent 2d5298b6ee
commit 2eba7ebd02
4 changed files with 120 additions and 62 deletions

View File

@@ -105,6 +105,46 @@ async def test_create_scalar_index(some_table: AsyncTable):
assert len(indices) == 0
@pytest.mark.asyncio
async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
metadata_type = pa.struct(
[
pa.field("user_id", pa.int32()),
pa.field("user.id", pa.int32()),
]
)
data = pa.Table.from_arrays(
[
pa.array([1, 2, 3], type=pa.int32()),
pa.array(
[
{"user_id": 10, "user.id": 100},
{"user_id": 20, "user.id": 200},
{"user_id": 30, "user.id": 300},
],
type=metadata_type,
),
],
names=["user_id", "metadata"],
)
table = await db_async.create_table("nested_scalar_index", data)
await table.create_index("user_id", config=BTree(), name="top_user_id_idx")
await table.create_index(
"metadata.user_id", config=BTree(), name="nested_user_id_idx"
)
await table.create_index(
"metadata.`user.id`", config=BTree(), name="escaped_user_id_idx"
)
columns_by_name = {
index.name: index.columns for index in await table.list_indices()
}
assert columns_by_name["top_user_id_idx"] == ["user_id"]
assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"]
assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"]
@pytest.mark.asyncio
async def test_create_fixed_size_binary_index(some_table: AsyncTable):
await some_table.create_index("fsb", config=BTree())

View File

@@ -23,17 +23,12 @@ impl VectorIndex {
.fields
.iter()
.map(|field_id| {
manifest
.schema
.field_by_id(*field_id)
.unwrap_or_else(|| {
panic!(
"field {field_id} of index {} must exist in schema",
index.name
)
})
.name
.clone()
manifest.schema.field_path(*field_id).unwrap_or_else(|_| {
panic!(
"field {field_id} of index {} must exist in schema",
index.name
)
})
})
.collect();
Self {

View File

@@ -3505,7 +3505,7 @@ mod tests {
{
"index_name": "my_idx",
"index_uuid": "34255f64-5717-4562-b3fc-2c963f66afa6",
"columns": ["my_column"],
"columns": ["metadata.`my.column`"],
"index_status": "done",
},
]
@@ -3544,7 +3544,7 @@ mod tests {
IndexConfig {
name: "my_idx".into(),
index_type: IndexType::LabelList,
columns: vec!["my_column".into()],
columns: vec!["metadata.`my.column`".into()],
},
];
assert_eq!(indices, expected);

View File

@@ -2688,16 +2688,13 @@ impl BaseTable for NativeTable {
message: "Multi-column (composite) indices are not yet supported".to_string(),
});
}
let dataset = self.dataset.get().await?;
self.dataset.ensure_mutable()?;
let mut dataset = (*self.dataset.get().await?).clone();
let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?;
drop(dataset);
let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?;
let index_type = self.get_index_type_for_field(&field, &opts.index);
let columns = [column.as_str()];
self.dataset.ensure_mutable()?;
let mut dataset = (*self.dataset.get().await?).clone();
let mut builder = dataset
.create_index_builder(&columns, index_type, lance_idx_params.as_ref())
.train(opts.train)
@@ -2815,63 +2812,88 @@ impl BaseTable for NativeTable {
async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
let dataset = self.dataset.get().await?;
let indices = dataset.load_indices().await?;
let results = futures::stream::iter(indices.as_slice()).then(|idx| async {
// skip Lance internal indexes
if idx.name == FRAG_REUSE_INDEX_NAME {
return None;
}
let stats = match dataset.index_statistics(idx.name.as_str()).await {
Ok(stats) => stats,
Err(e) => {
log::warn!("Failed to get statistics for index {} ({}): {}", idx.name, idx.uuid, e);
let results = futures::stream::iter(indices.as_slice())
.then(|idx| async {
// skip Lance internal indexes
if idx.name == FRAG_REUSE_INDEX_NAME {
return None;
}
};
let stats: serde_json::Value = match serde_json::from_str(&stats) {
Ok(stats) => stats,
Err(e) => {
log::warn!("Failed to deserialize index statistics for index {} ({}): {}", idx.name, idx.uuid, e);
return None;
}
};
let Some(index_type) = stats.get("index_type").and_then(|v| v.as_str()) else {
log::warn!("Index statistics was missing 'index_type' field for index {} ({})", idx.name, idx.uuid);
return None;
};
let index_type: crate::index::IndexType = match index_type.parse() {
Ok(index_type) => index_type,
Err(e) => {
log::warn!("Failed to parse index type for index {} ({}): {}", idx.name, idx.uuid, e);
return None;
}
};
let mut columns = Vec::with_capacity(idx.fields.len());
for field_id in &idx.fields {
let column = match dataset.schema().field_path(*field_id) {
Ok(column) => column,
let stats = match dataset.index_statistics(idx.name.as_str()).await {
Ok(stats) => stats,
Err(e) => {
log::warn!(
"The index {} ({}) referenced a field with id {} which does not exist in the schema: {}",
"Failed to get statistics for index {} ({}): {}",
idx.name,
idx.uuid,
field_id,
e
);
return None;
}
};
columns.push(column);
}
let name = idx.name.clone();
Some(IndexConfig { index_type, columns, name })
}).collect::<Vec<_>>().await;
let stats: serde_json::Value = match serde_json::from_str(&stats) {
Ok(stats) => stats,
Err(e) => {
log::warn!(
"Failed to deserialize index statistics for index {} ({}): {}",
idx.name,
idx.uuid,
e
);
return None;
}
};
let Some(index_type) = stats.get("index_type").and_then(|v| v.as_str()) else {
log::warn!(
"Index statistics was missing 'index_type' field for index {} ({})",
idx.name,
idx.uuid
);
return None;
};
let index_type: crate::index::IndexType = match index_type.parse() {
Ok(index_type) => index_type,
Err(e) => {
log::warn!(
"Failed to parse index type for index {} ({}): {}",
idx.name,
idx.uuid,
e
);
return None;
}
};
let mut columns = Vec::with_capacity(idx.fields.len());
for field_id in &idx.fields {
let field_path = match dataset.schema().field_path(*field_id) {
Ok(field_path) => field_path,
Err(e) => {
log::warn!(
"Failed to resolve field path for index {} ({}) field id {}: {}",
idx.name,
idx.uuid,
field_id,
e
);
return None;
}
};
columns.push(field_path);
}
let name = idx.name.clone();
Some(IndexConfig {
index_type,
columns,
name,
})
})
.collect::<Vec<_>>()
.await;
Ok(results.into_iter().flatten().collect())
}
@@ -3074,6 +3096,7 @@ pub struct FragmentSummaryStats {
#[cfg(test)]
#[allow(deprecated)]
mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;