From 5bfde47a8eb214edfd83a1c9a378ddd8ed62cfb1 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 20 May 2026 11:15:15 +0800 Subject: [PATCH] fix: support nested field paths in native index creation (#3408) Native index creation was resolving requested columns through top-level Arrow schema lookup before handing the request to Lance, which rejected nested paths and could collapse a nested field to its leaf name. This PR resolves index targets with Lance field-path semantics, passes the canonical path through to Lance, and reports indexed columns from field ids as canonical full paths. This also removes the Python native FTS guard that rejected dotted paths so scalar, vector, and FTS index creation share the same nested-field contract. Related to #3402. --- python/python/lancedb/table.py | 5 - python/python/tests/test_fts.py | 15 +- python/python/tests/test_table.py | 49 ++++++ rust/lancedb/src/table.rs | 257 ++++++++++++++++++++++++++++-- 4 files changed, 308 insertions(+), 18 deletions(-) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 8f39b5c71..79c1ef136 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2542,11 +2542,6 @@ class LanceTable(Table): "at a time. To search over multiple text fields, create a " "separate FTS index for each field." ) - if "." in field_names: - raise ValueError( - "Native FTS indexes can only be created on top-level fields. " - f"Received nested field path: {field_names!r}." 
- ) if tokenizer_name is None: tokenizer_configs = { diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index acd362a09..6892377cb 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -563,8 +563,19 @@ def test_create_index_multiple_columns(tmp_path, table): def test_nested_schema(tmp_path, table): - with pytest.raises(ValueError, match="top-level fields"): - table.create_fts_index("nested.text") + table.create_fts_index("nested.text") + indices = table.list_indices() + assert len(indices) == 1 + assert indices[0].index_type == "FTS" + assert indices[0].columns == ["nested.text"] + + results = ( + table.search("puppy", query_type="fts", fts_columns="nested.text") + .limit(5) + .to_list() + ) + assert len(results) > 0 + assert all("puppy" in row["nested"]["text"] for row in results) def test_search_index_with_filter(table): diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index fcb30f791..74d63d311 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -1890,6 +1890,55 @@ def test_create_scalar_index(mem_db: DBConnection): assert scalar_index.name == "custom_y_index" +def test_create_index_nested_field_paths(mem_db: DBConnection): + schema = pa.schema( + [ + pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])), + pa.field( + "image", + pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]), + ), + ] + ) + data = pa.Table.from_pylist( + [ + { + "metadata": {"user_id": i}, + "image": {"embedding": [float(i), float(i + 1)]}, + } + for i in range(256) + ], + schema=schema, + ) + table = mem_db.create_table("nested_index_paths", data=data) + + table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx") + table.create_index( + vector_column_name="image.embedding", + num_partitions=1, + num_sub_vectors=1, + name="image_embedding_idx", + ) + + indices = sorted(table.list_indices(), key=lambda idx: idx.name) + 
assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [ + ("image_embedding_idx", "IvfPq", ["image.embedding"]), + ("metadata_user_id_idx", "BTree", ["metadata.user_id"]), + ] + + vector_results = ( + table.search([0.0, 1.0], vector_column_name="image.embedding") + .limit(1) + .to_list() + ) + assert len(vector_results) == 1 + assert vector_results[0]["metadata"]["user_id"] == 0 + + filtered_results = table.search().where("metadata.user_id = 42").limit(1).to_list() + assert len(filtered_results) == 1 + assert filtered_results[0]["metadata"]["user_id"] == 42 + + def test_empty_query(mem_db: DBConnection): table = mem_db.create_table( "my_table", diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 9398fb4e1..fa8edd2af 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2171,6 +2171,33 @@ impl NativeTable { } } + fn resolve_index_field( + schema: &lance_core::datatypes::Schema, + column: &str, + ) -> Result<(String, Field)> { + lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput { + message: format!("Invalid field path `{}`: {}", column, e), + })?; + + let field_path = schema + .resolve_case_insensitive(column) + .ok_or_else(|| Error::Schema { + message: format!( + "Field path `{}` not found in schema. 
Available field paths: {}", + column, + schema.field_paths().join(", ") + ), + })?; + let field = field_path.last().expect("field path should be non-empty"); + let path_segments = field_path + .iter() + .map(|field| field.name.as_str()) + .collect::<Vec<_>>(); + let canonical_path = lance_core::datatypes::format_field_path(&path_segments); + + Ok((canonical_path, Field::from(*field))) + } + // Convert LanceDB Index to Lance IndexParams async fn make_index_params( &self, @@ -2661,13 +2688,14 @@ impl BaseTable for NativeTable { message: "Multi-column (composite) indices are not yet supported".to_string(), }); } - let schema = self.schema().await?; - let field = schema.field_with_name(&opts.columns[0])?; + let dataset = self.dataset.get().await?; + let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?; + drop(dataset); - let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?; - let index_type = self.get_index_type_for_field(field, &opts.index); - let columns = [field.name().as_str()]; + let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?; + let index_type = self.get_index_type_for_field(&field, &opts.index); + let columns = [column.as_str()]; self.dataset.ensure_mutable()?; let mut dataset = (*self.dataset.get().await?).clone(); let mut builder = dataset @@ -2825,11 +2853,20 @@ impl BaseTable for NativeTable { let mut columns = Vec::with_capacity(idx.fields.len()); for field_id in &idx.fields { - let Some(field) = dataset.schema().field_by_id(*field_id) else { - log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id); - return None; + let column = match dataset.schema().field_path(*field_id) { + Ok(column) => column, + Err(e) => { + log::warn!( + "The index {} ({}) referenced a field with id {} which does not exist in the schema: {}", + idx.name, + idx.uuid, + field_id, + e + ); + return None; + } }; - 
columns.push(field.name.clone()); + columns.push(column); } let name = idx.name.clone(); @@ -3042,8 +3079,8 @@ mod tests { use std::time::Duration; use arrow_array::{ - Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch, - RecordBatchIterator, RecordBatchReader, StringArray, + Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, + RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, StructArray, builder::{ListBuilder, StringBuilder}, }; use arrow_array::{BinaryArray, LargeBinaryArray}; @@ -3063,6 +3100,7 @@ mod tests { use crate::query::Select; use crate::query::{ExecutableQuery, QueryBase}; use crate::test_utils::connection::new_test_connection; + use lance_index::scalar::FullTextSearchQuery; #[tokio::test] async fn test_open() { let tmp_dir = tempdir().unwrap(); @@ -3650,6 +3688,203 @@ mod tests { assert_eq!(stats.num_unindexed_rows, 0); } + #[tokio::test] + async fn test_create_index_nested_field_paths() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + + let num_rows = 512; + let dimension = 8; + + let metadata = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("user_id", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let vector_values = arrow_array::Float32Array::from_iter_values( + (0..num_rows * dimension).map(|v| v as f32), + ); + let embeddings = + Arc::new(create_fixed_size_list(vector_values, dimension).unwrap()) as ArrayRef; + let image = Arc::new(StructArray::from(vec![( + Arc::new(Field::new( + "embedding", + embeddings.data_type().clone(), + false, + )), + embeddings, + )])); + + let payload = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("text", DataType::Utf8, false)), + Arc::new(StringArray::from_iter_values( + (0..num_rows).map(|i| format!("document {}", i)), + )) as ArrayRef, + )])); + + let meta_data 
= Arc::new(StructArray::from(vec![( + Arc::new(Field::new("user-id", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let literal = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("a.b", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let schema = Arc::new(Schema::new(vec![ + Field::new("metadata", metadata.data_type().clone(), false), + Field::new("image", image.data_type().clone(), false), + Field::new("payload", payload.data_type().clone(), false), + Field::new("meta-data", meta_data.data_type().clone(), false), + Field::new("literal", literal.data_type().clone(), false), + ])); + let batch = + RecordBatch::try_new(schema, vec![metadata, image, payload, meta_data, literal]) + .unwrap(); + + let table = conn + .create_table("nested_index_paths", batch) + .execute() + .await + .unwrap(); + + table + .create_index( + &["metadata.user_id"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("metadata_user_id_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["image.embedding"], Index::Auto) + .name("image_embedding_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["payload.text"], Index::FTS(Default::default())) + .name("payload_text_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index( + &["`meta-data`.`user-id`"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("escaped_names_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index( + &["literal.`a.b`"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("literal_dot_idx".to_string()) + .execute() + .await + .unwrap(); + + let mut index_configs = table.list_indices().await.unwrap(); + index_configs.sort_by(|left, right| left.name.cmp(&right.name)); + + let indexed_columns = index_configs + .iter() + .map(|index| { + ( + index.name.as_str(), + index.columns.as_slice(), + 
index.index_type.clone(), + ) + }) + .collect::<Vec<_>>(); + assert_eq!( + indexed_columns, + vec![ + ( + "escaped_names_idx", + &["`meta-data`.`user-id`".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "image_embedding_idx", + &["image.embedding".to_string()][..], + crate::index::IndexType::IvfPq, + ), + ( + "literal_dot_idx", + &["literal.`a.b`".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "metadata_user_id_idx", + &["metadata.user_id".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "payload_text_idx", + &["payload.text".to_string()][..], + crate::index::IndexType::FTS, + ), + ] + ); + + let vector_results = table + .query() + .nearest_to(&[0.0; 8]) + .unwrap() + .column("image.embedding") + .limit(1) + .execute() + .await + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + assert_eq!( + vector_results + .iter() + .map(|batch| batch.num_rows()) + .sum::<usize>(), + 1 + ); + + let fts_results = table + .query() + .full_text_search(FullTextSearchQuery::new("document".to_string())) + .limit(5) + .execute() + .await + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + assert!(!fts_results.is_empty()); + + let filtered_results = table + .query() + .only_if("metadata.user_id = 42") + .limit(1) + .execute() + .await + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + assert_eq!( + filtered_results + .iter() + .map(|batch| batch.num_rows()) + .sum::<usize>(), + 1 + ); + } + #[tokio::test] async fn test_create_bitmap_index() { let tmp_dir = tempdir().unwrap();