diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 8f39b5c71..79c1ef136 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2542,11 +2542,6 @@ class LanceTable(Table): "at a time. To search over multiple text fields, create a " "separate FTS index for each field." ) - if "." in field_names: - raise ValueError( - "Native FTS indexes can only be created on top-level fields. " - f"Received nested field path: {field_names!r}." - ) if tokenizer_name is None: tokenizer_configs = { diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index acd362a09..6892377cb 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -563,8 +563,19 @@ def test_create_index_multiple_columns(tmp_path, table): def test_nested_schema(tmp_path, table): - with pytest.raises(ValueError, match="top-level fields"): - table.create_fts_index("nested.text") + table.create_fts_index("nested.text") + indices = table.list_indices() + assert len(indices) == 1 + assert indices[0].index_type == "FTS" + assert indices[0].columns == ["nested.text"] + + results = ( + table.search("puppy", query_type="fts", fts_columns="nested.text") + .limit(5) + .to_list() + ) + assert len(results) > 0 + assert all("puppy" in row["nested"]["text"] for row in results) def test_search_index_with_filter(table): diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index fcb30f791..74d63d311 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -1890,6 +1890,55 @@ def test_create_scalar_index(mem_db: DBConnection): assert scalar_index.name == "custom_y_index" +def test_create_index_nested_field_paths(mem_db: DBConnection): + schema = pa.schema( + [ + pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])), + pa.field( + "image", + pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]), + ), + ] + ) + data = pa.Table.from_pylist( + [ + { + "metadata": {"user_id": i}, + "image": {"embedding": [float(i), float(i + 1)]}, + } + for i in range(256) + ], + schema=schema, + ) + table = mem_db.create_table("nested_index_paths", data=data) + + table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx") + table.create_index( + vector_column_name="image.embedding", + num_partitions=1, + num_sub_vectors=1, + name="image_embedding_idx", + ) + + indices = sorted(table.list_indices(), key=lambda idx: idx.name) + assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [ + ("image_embedding_idx", "IvfPq", ["image.embedding"]), + ("metadata_user_id_idx", "BTree", ["metadata.user_id"]), + ] + + vector_results = ( + table.search([0.0, 1.0], vector_column_name="image.embedding") + .limit(1) + .to_list() + ) + assert len(vector_results) == 1 + assert vector_results[0]["metadata"]["user_id"] == 0 + + filtered_results = table.search().where("metadata.user_id = 42").limit(1).to_list() + assert len(filtered_results) == 1 + assert filtered_results[0]["metadata"]["user_id"] == 42 + + def test_empty_query(mem_db: DBConnection): table = mem_db.create_table( "my_table", diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 9398fb4e1..fa8edd2af 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2171,6 +2171,33 @@ impl NativeTable { } } + fn resolve_index_field( + schema: &lance_core::datatypes::Schema, + column: &str, + ) -> Result<(String, Field)> { + lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput { + message: format!("Invalid field path `{}`: {}", column, e), + })?; + + let field_path = schema + .resolve_case_insensitive(column) + .ok_or_else(|| Error::Schema { + message: format!( + "Field path `{}` not found in schema. Available field paths: {}", + column, + schema.field_paths().join(", ") + ), + })?; + let field = field_path.last().expect("field path should be non-empty"); + let path_segments = field_path + .iter() + .map(|field| field.name.as_str()) + .collect::>(); + let canonical_path = lance_core::datatypes::format_field_path(&path_segments); + + Ok((canonical_path, Field::from(*field))) + } + // Convert LanceDB Index to Lance IndexParams async fn make_index_params( &self, @@ -2661,13 +2688,14 @@ impl BaseTable for NativeTable { message: "Multi-column (composite) indices are not yet supported".to_string(), }); } - let schema = self.schema().await?; - let field = schema.field_with_name(&opts.columns[0])?; + let dataset = self.dataset.get().await?; + let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?; + drop(dataset); - let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?; - let index_type = self.get_index_type_for_field(field, &opts.index); - let columns = [field.name().as_str()]; + let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?; + let index_type = self.get_index_type_for_field(&field, &opts.index); + let columns = [column.as_str()]; self.dataset.ensure_mutable()?; let mut dataset = (*self.dataset.get().await?).clone(); let mut builder = dataset @@ -2825,11 +2853,20 @@ impl BaseTable for NativeTable { let mut columns = Vec::with_capacity(idx.fields.len()); for field_id in &idx.fields { - let Some(field) = dataset.schema().field_by_id(*field_id) else { - log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id); - return None; + let column = match dataset.schema().field_path(*field_id) { + Ok(column) => column, + Err(e) => { + log::warn!( + "The index {} ({}) referenced a field with id {} which does not exist in the schema: {}", + idx.name, + idx.uuid, + field_id, + e + ); + return None; + } }; - columns.push(field.name.clone()); + columns.push(column); } let name = idx.name.clone(); @@ -3042,8 +3079,8 @@ mod tests { use std::time::Duration; use arrow_array::{ - Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch, - RecordBatchIterator, RecordBatchReader, StringArray, + Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, + RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, StructArray, builder::{ListBuilder, StringBuilder}, }; use arrow_array::{BinaryArray, LargeBinaryArray}; @@ -3063,6 +3100,7 @@ mod tests { use crate::query::Select; use crate::query::{ExecutableQuery, QueryBase}; use crate::test_utils::connection::new_test_connection; + use lance_index::scalar::FullTextSearchQuery; #[tokio::test] async fn test_open() { let tmp_dir = tempdir().unwrap(); @@ -3650,6 +3688,203 @@ mod tests { assert_eq!(stats.num_unindexed_rows, 0); } + #[tokio::test] + async fn test_create_index_nested_field_paths() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + + let num_rows = 512; + let dimension = 8; + + let metadata = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("user_id", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let vector_values = arrow_array::Float32Array::from_iter_values( + (0..num_rows * dimension).map(|v| v as f32), + ); + let embeddings = + Arc::new(create_fixed_size_list(vector_values, dimension).unwrap()) as ArrayRef; + let image = Arc::new(StructArray::from(vec![( + Arc::new(Field::new( + "embedding", + embeddings.data_type().clone(), + false, + )), + embeddings, + )])); + + let payload = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("text", DataType::Utf8, false)), + Arc::new(StringArray::from_iter_values( + (0..num_rows).map(|i| format!("document {}", i)), + )) as ArrayRef, + )])); + + let meta_data = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("user-id", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let literal = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("a.b", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + + let schema = Arc::new(Schema::new(vec![ + Field::new("metadata", metadata.data_type().clone(), false), + Field::new("image", image.data_type().clone(), false), + Field::new("payload", payload.data_type().clone(), false), + Field::new("meta-data", meta_data.data_type().clone(), false), + Field::new("literal", literal.data_type().clone(), false), + ])); + let batch = + RecordBatch::try_new(schema, vec![metadata, image, payload, meta_data, literal]) + .unwrap(); + + let table = conn + .create_table("nested_index_paths", batch) + .execute() + .await + .unwrap(); + + table + .create_index( + &["metadata.user_id"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("metadata_user_id_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["image.embedding"], Index::Auto) + .name("image_embedding_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["payload.text"], Index::FTS(Default::default())) + .name("payload_text_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index( + &["`meta-data`.`user-id`"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("escaped_names_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index( + &["literal.`a.b`"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("literal_dot_idx".to_string()) + .execute() + .await + .unwrap(); + + let mut index_configs = table.list_indices().await.unwrap(); + index_configs.sort_by(|left, right| left.name.cmp(&right.name)); + + let indexed_columns = index_configs + .iter() + .map(|index| { + ( + index.name.as_str(), + index.columns.as_slice(), + index.index_type.clone(), + ) + }) + .collect::>(); + assert_eq!( + indexed_columns, + vec![ + ( + "escaped_names_idx", + &["`meta-data`.`user-id`".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "image_embedding_idx", + &["image.embedding".to_string()][..], + crate::index::IndexType::IvfPq, + ), + ( + "literal_dot_idx", + &["literal.`a.b`".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "metadata_user_id_idx", + &["metadata.user_id".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "payload_text_idx", + &["payload.text".to_string()][..], + crate::index::IndexType::FTS, + ), + ] + ); + + let vector_results = table + .query() + .nearest_to(&[0.0; 8]) + .unwrap() + .column("image.embedding") + .limit(1) + .execute() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + vector_results + .iter() + .map(|batch| batch.num_rows()) + .sum::(), + 1 + ); + + let fts_results = table + .query() + .full_text_search(FullTextSearchQuery::new("document".to_string())) + .limit(5) + .execute() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert!(!fts_results.is_empty()); + + let filtered_results = table + .query() + .only_if("metadata.user_id = 42") + .limit(1) + .execute() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + filtered_results + .iter() + .map(|batch| batch.num_rows()) + .sum::(), + 1 + ); + } + #[tokio::test] async fn test_create_bitmap_index() { let tmp_dir = tempdir().unwrap();