fix: discover nested vector columns by default (#3423)

LanceDB default vector column discovery only considered top-level
fields, so tables with a single nested vector leaf still required users
to pass an explicit field path. This updates Rust and Python discovery
to recurse into struct fields, return canonical field paths, and
preserve actionable errors when no default or multiple defaults exist.

The explicit nested path flow for index creation and search remains
supported across Rust, Python, and Node, with regression coverage for
single nested vector leaves, multiple candidate leaves, and schemas
without vector leaves.

Closes #3405.
This commit is contained in:
Xuanwo
2026-05-21 19:02:41 +08:00
committed by GitHub
parent 55ae6197c1
commit d5dc4c0f06
6 changed files with 415 additions and 34 deletions

View File

@@ -27,7 +27,9 @@ use crate::table::UpdateResult;
use crate::table::query::create_multi_vector_plan;
use crate::table::{AnyQuery, Filter, PreprocessingOutput, TableStatistics};
use crate::utils::background_cache::BackgroundCache;
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::utils::{
resolve_arrow_field_path, supported_btree_data_type, supported_vector_data_type,
};
use crate::{DistanceType, Error};
use crate::{
error::Result,
@@ -1563,11 +1565,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
Index::FTS(p) => ("FTS", Some(to_json(p)?)),
Index::Auto => {
let schema = self.schema().await?;
let field = schema
.field_with_name(&column)
.map_err(|_| Error::InvalidInput {
message: format!("Column {} not found in schema", column),
})?;
let field = resolve_arrow_field_path(&schema, &column)?;
if supported_vector_data_type(field.data_type()) {
body[METRIC_TYPE_KEY] =
serde_json::Value::String(DistanceType::L2.to_string().to_lowercase());

View File

@@ -3877,6 +3877,25 @@ mod tests {
1
);
let default_vector_results = table
.query()
.nearest_to(&[0.0; 8])
.unwrap()
.limit(1)
.execute()
.await
.unwrap()
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
default_vector_results
.iter()
.map(|batch| batch.num_rows())
.sum::<usize>(),
1
);
let fts_results = table
.query()
.full_text_search(FullTextSearchQuery::new("document".to_string()))

View File

@@ -6,7 +6,7 @@ pub(crate) mod background_cache;
use std::sync::Arc;
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Schema, SchemaRef};
use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
use datafusion_common::{DataFusionError, Result as DataFusionResult};
use datafusion_execution::RecordBatchStream;
use futures::{FutureExt, Stream};
@@ -152,14 +152,10 @@ pub fn validate_namespace(namespace: &[String]) -> Result<()> {
/// Find one default column to create index or perform vector query.
pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result<String> {
// Try to find a vector column.
let candidates = schema
.fields()
.iter()
.filter_map(|field| match infer_vector_dim(field.data_type()) {
Ok(d) if dim.is_none() || dim == Some(d as i32) => Some(field.name()),
_ => None,
})
.collect::<Vec<_>>();
let mut candidates = Vec::new();
for field in schema.fields() {
collect_vector_columns(field, &mut Vec::new(), dim, &mut candidates);
}
if candidates.is_empty() {
Err(Error::InvalidInput {
message: format!(
@@ -180,6 +176,63 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
}
}
fn collect_vector_columns(
field: &Field,
path: &mut Vec<String>,
dim: Option<i32>,
candidates: &mut Vec<String>,
) {
path.push(field.name().clone());
match infer_vector_dim(field.data_type()) {
Ok(d) if dim.is_none() || dim == Some(d as i32) => {
let path_segments = path.iter().map(String::as_str).collect::<Vec<_>>();
candidates.push(lance_core::datatypes::format_field_path(&path_segments));
}
_ => {
if let DataType::Struct(fields) = field.data_type() {
for child in fields {
collect_vector_columns(child, path, dim, candidates);
}
}
}
}
path.pop();
}
pub(crate) fn resolve_arrow_field_path(schema: &Schema, column: &str) -> Result<Field> {
let segments =
lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput {
message: format!("Invalid field path `{}`: {}", column, e),
})?;
let mut fields = schema.fields();
for (idx, segment) in segments.iter().enumerate() {
let field = find_field(fields, segment).ok_or_else(|| Error::Schema {
message: format!("Field path `{}` not found in schema", column),
})?;
if idx + 1 == segments.len() {
return Ok(field.clone());
}
fields = match field.data_type() {
DataType::Struct(fields) => fields,
_ => {
return Err(Error::Schema {
message: format!("Field path `{}` not found in schema", column),
});
}
};
}
unreachable!("parse_field_path returns at least one segment")
}
fn find_field<'a>(fields: &'a Fields, name: &str) -> Option<&'a Field> {
fields
.iter()
.find(|field| field.name() == name)
.map(|field| field.as_ref())
}
pub fn supported_btree_data_type(dtype: &DataType) -> bool {
dtype.is_integer()
|| dtype.is_floating()
@@ -450,6 +503,49 @@ mod tests {
"vec"
);
let schema_with_nested_vec_col = Schema::new(vec![
Field::new("id", DataType::Int16, true),
Field::new(
"image",
DataType::Struct(
vec![Field::new(
"embedding",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, false)),
10,
),
false,
)]
.into(),
),
false,
),
]);
assert_eq!(
default_vector_column(&schema_with_nested_vec_col, None).unwrap(),
"image.embedding"
);
let schema_with_escaped_nested_vec_col = Schema::new(vec![Field::new(
"image-meta",
DataType::Struct(
vec![Field::new(
"embedding.v1",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, false)),
10,
),
false,
)]
.into(),
),
false,
)]);
assert_eq!(
default_vector_column(&schema_with_escaped_nested_vec_col, None).unwrap(),
"`image-meta`.`embedding.v1`"
);
let multi_vec_col = Schema::new(vec![
Field::new("id", DataType::Int16, true),
Field::new(
@@ -469,6 +565,48 @@ mod tests {
.to_string()
.contains("More than one")
);
let multi_nested_vec_col = Schema::new(vec![
Field::new(
"image",
DataType::Struct(
vec![Field::new(
"embedding",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, false)),
10,
),
false,
)]
.into(),
),
false,
),
Field::new(
"text",
DataType::Struct(
vec![Field::new(
"embedding",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, false)),
50,
),
false,
)]
.into(),
),
false,
),
]);
assert_eq!(
default_vector_column(&multi_nested_vec_col, Some(50)).unwrap(),
"text.embedding"
);
let err = default_vector_column(&multi_nested_vec_col, None)
.unwrap_err()
.to_string();
assert!(err.contains("image.embedding"));
assert!(err.contains("text.embedding"));
}
#[test]