mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-20 21:40:43 +00:00
fix: support nested field paths in native index creation (#3408)
Native index creation was resolving requested columns through top-level Arrow schema lookup before handing the request to Lance, which rejected nested paths and could collapse a nested field to its leaf name. This PR resolves index targets with Lance field-path semantics, passes the canonical path through to Lance, and reports indexed columns from field ids as canonical full paths. This also removes the Python native FTS guard that rejected dotted paths so scalar, vector, and FTS index creation share the same nested-field contract. Related to #3402.
This commit is contained in:
@@ -2542,11 +2542,6 @@ class LanceTable(Table):
|
||||
"at a time. To search over multiple text fields, create a "
|
||||
"separate FTS index for each field."
|
||||
)
|
||||
if "." in field_names:
|
||||
raise ValueError(
|
||||
"Native FTS indexes can only be created on top-level fields. "
|
||||
f"Received nested field path: {field_names!r}."
|
||||
)
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_configs = {
|
||||
|
||||
@@ -563,8 +563,19 @@ def test_create_index_multiple_columns(tmp_path, table):
|
||||
|
||||
|
||||
def test_nested_schema(tmp_path, table):
    """A native FTS index can be created on, listed for, and searched via a nested path."""
    # Creating the index on a dotted path is expected to succeed; the earlier
    # expectation of a "top-level fields" ValueError no longer applies.
    table.create_fts_index("nested.text")

    # The index must be reported under the full nested path.
    indices = table.list_indices()
    assert len(indices) == 1
    assert indices[0].index_type == "FTS"
    assert indices[0].columns == ["nested.text"]

    # Searching through the nested column must return only matching rows.
    results = (
        table.search("puppy", query_type="fts", fts_columns="nested.text")
        .limit(5)
        .to_list()
    )
    assert len(results) > 0
    assert all("puppy" in row["nested"]["text"] for row in results)
|
||||
|
||||
|
||||
def test_search_index_with_filter(table):
|
||||
|
||||
@@ -1890,6 +1890,55 @@ def test_create_scalar_index(mem_db: DBConnection):
|
||||
assert scalar_index.name == "custom_y_index"
|
||||
|
||||
|
||||
def test_create_index_nested_field_paths(mem_db: DBConnection):
    """Scalar and vector indexes on struct sub-fields are listed by their dotted paths."""
    # Two struct columns, each wrapping the leaf field that gets indexed.
    user_struct = pa.struct([pa.field("user_id", pa.int32())])
    embedding_struct = pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))])
    schema = pa.schema(
        [
            pa.field("metadata", user_struct),
            pa.field("image", embedding_struct),
        ]
    )

    rows = []
    for i in range(256):
        rows.append(
            {
                "metadata": {"user_id": i},
                "image": {"embedding": [float(i), float(i + 1)]},
            }
        )
    table = mem_db.create_table(
        "nested_index_paths",
        data=pa.Table.from_pylist(rows, schema=schema),
    )

    # Build one scalar and one vector index, both addressed by nested path.
    table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
    table.create_index(
        vector_column_name="image.embedding",
        num_partitions=1,
        num_sub_vectors=1,
        name="image_embedding_idx",
    )

    # Sort by name so the comparison does not depend on listing order.
    listed = sorted(table.list_indices(), key=lambda idx: idx.name)
    observed = [(idx.name, idx.index_type, idx.columns) for idx in listed]
    assert observed == [
        ("image_embedding_idx", "IvfPq", ["image.embedding"]),
        ("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
    ]

    # Vector search through the nested column: row 0 holds exactly [0.0, 1.0].
    nearest = (
        table.search([0.0, 1.0], vector_column_name="image.embedding")
        .limit(1)
        .to_list()
    )
    assert len(nearest) == 1
    assert nearest[0]["metadata"]["user_id"] == 0

    # Filtering on the nested scalar column.
    matching = table.search().where("metadata.user_id = 42").limit(1).to_list()
    assert len(matching) == 1
    assert matching[0]["metadata"]["user_id"] == 42
|
||||
|
||||
|
||||
def test_empty_query(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"my_table",
|
||||
|
||||
@@ -2171,6 +2171,33 @@ impl NativeTable {
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_index_field(
|
||||
schema: &lance_core::datatypes::Schema,
|
||||
column: &str,
|
||||
) -> Result<(String, Field)> {
|
||||
lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput {
|
||||
message: format!("Invalid field path `{}`: {}", column, e),
|
||||
})?;
|
||||
|
||||
let field_path = schema
|
||||
.resolve_case_insensitive(column)
|
||||
.ok_or_else(|| Error::Schema {
|
||||
message: format!(
|
||||
"Field path `{}` not found in schema. Available field paths: {}",
|
||||
column,
|
||||
schema.field_paths().join(", ")
|
||||
),
|
||||
})?;
|
||||
let field = field_path.last().expect("field path should be non-empty");
|
||||
let path_segments = field_path
|
||||
.iter()
|
||||
.map(|field| field.name.as_str())
|
||||
.collect::<Vec<_>>();
|
||||
let canonical_path = lance_core::datatypes::format_field_path(&path_segments);
|
||||
|
||||
Ok((canonical_path, Field::from(*field)))
|
||||
}
|
||||
|
||||
// Convert LanceDB Index to Lance IndexParams
|
||||
async fn make_index_params(
|
||||
&self,
|
||||
@@ -2661,13 +2688,14 @@ impl BaseTable for NativeTable {
|
||||
message: "Multi-column (composite) indices are not yet supported".to_string(),
|
||||
});
|
||||
}
|
||||
let schema = self.schema().await?;
|
||||
|
||||
let field = schema.field_with_name(&opts.columns[0])?;
|
||||
let dataset = self.dataset.get().await?;
|
||||
let (column, field) = Self::resolve_index_field(dataset.schema(), &opts.columns[0])?;
|
||||
drop(dataset);
|
||||
|
||||
let lance_idx_params = self.make_index_params(field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(field, &opts.index);
|
||||
let columns = [field.name().as_str()];
|
||||
let lance_idx_params = self.make_index_params(&field, opts.index.clone()).await?;
|
||||
let index_type = self.get_index_type_for_field(&field, &opts.index);
|
||||
let columns = [column.as_str()];
|
||||
self.dataset.ensure_mutable()?;
|
||||
let mut dataset = (*self.dataset.get().await?).clone();
|
||||
let mut builder = dataset
|
||||
@@ -2825,11 +2853,20 @@ impl BaseTable for NativeTable {
|
||||
|
||||
let mut columns = Vec::with_capacity(idx.fields.len());
|
||||
for field_id in &idx.fields {
|
||||
let Some(field) = dataset.schema().field_by_id(*field_id) else {
|
||||
log::warn!("The index {} ({}) referenced a field with id {} which does not exist in the schema", idx.name, idx.uuid, field_id);
|
||||
return None;
|
||||
let column = match dataset.schema().field_path(*field_id) {
|
||||
Ok(column) => column,
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"The index {} ({}) referenced a field with id {} which does not exist in the schema: {}",
|
||||
idx.name,
|
||||
idx.uuid,
|
||||
field_id,
|
||||
e
|
||||
);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
columns.push(field.name.clone());
|
||||
columns.push(column);
|
||||
}
|
||||
|
||||
let name = idx.name.clone();
|
||||
@@ -3042,8 +3079,8 @@ mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow_array::{
|
||||
Array, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch,
|
||||
RecordBatchIterator, RecordBatchReader, StringArray,
|
||||
Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray,
|
||||
RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, StructArray,
|
||||
builder::{ListBuilder, StringBuilder},
|
||||
};
|
||||
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||
@@ -3063,6 +3100,7 @@ mod tests {
|
||||
use crate::query::Select;
|
||||
use crate::query::{ExecutableQuery, QueryBase};
|
||||
use crate::test_utils::connection::new_test_connection;
|
||||
use lance_index::scalar::FullTextSearchQuery;
|
||||
#[tokio::test]
|
||||
async fn test_open() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
@@ -3650,6 +3688,203 @@ mod tests {
|
||||
assert_eq!(stats.num_unindexed_rows, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_index_nested_field_paths() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let conn = ConnectBuilder::new(uri).execute().await.unwrap();
|
||||
|
||||
let num_rows = 512;
|
||||
let dimension = 8;
|
||||
|
||||
let metadata = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("user_id", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let vector_values = arrow_array::Float32Array::from_iter_values(
|
||||
(0..num_rows * dimension).map(|v| v as f32),
|
||||
);
|
||||
let embeddings =
|
||||
Arc::new(create_fixed_size_list(vector_values, dimension).unwrap()) as ArrayRef;
|
||||
let image = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new(
|
||||
"embedding",
|
||||
embeddings.data_type().clone(),
|
||||
false,
|
||||
)),
|
||||
embeddings,
|
||||
)]));
|
||||
|
||||
let payload = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("text", DataType::Utf8, false)),
|
||||
Arc::new(StringArray::from_iter_values(
|
||||
(0..num_rows).map(|i| format!("document {}", i)),
|
||||
)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let meta_data = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("user-id", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let literal = Arc::new(StructArray::from(vec![(
|
||||
Arc::new(Field::new("a.b", DataType::Int32, false)),
|
||||
Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef,
|
||||
)]));
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("metadata", metadata.data_type().clone(), false),
|
||||
Field::new("image", image.data_type().clone(), false),
|
||||
Field::new("payload", payload.data_type().clone(), false),
|
||||
Field::new("meta-data", meta_data.data_type().clone(), false),
|
||||
Field::new("literal", literal.data_type().clone(), false),
|
||||
]));
|
||||
let batch =
|
||||
RecordBatch::try_new(schema, vec![metadata, image, payload, meta_data, literal])
|
||||
.unwrap();
|
||||
|
||||
let table = conn
|
||||
.create_table("nested_index_paths", batch)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
table
|
||||
.create_index(
|
||||
&["metadata.user_id"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("metadata_user_id_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(&["image.embedding"], Index::Auto)
|
||||
.name("image_embedding_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(&["payload.text"], Index::FTS(Default::default()))
|
||||
.name("payload_text_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(
|
||||
&["`meta-data`.`user-id`"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("escaped_names_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
table
|
||||
.create_index(
|
||||
&["literal.`a.b`"],
|
||||
Index::BTree(BTreeIndexBuilder::default()),
|
||||
)
|
||||
.name("literal_dot_idx".to_string())
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut index_configs = table.list_indices().await.unwrap();
|
||||
index_configs.sort_by(|left, right| left.name.cmp(&right.name));
|
||||
|
||||
let indexed_columns = index_configs
|
||||
.iter()
|
||||
.map(|index| {
|
||||
(
|
||||
index.name.as_str(),
|
||||
index.columns.as_slice(),
|
||||
index.index_type.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
indexed_columns,
|
||||
vec![
|
||||
(
|
||||
"escaped_names_idx",
|
||||
&["`meta-data`.`user-id`".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"image_embedding_idx",
|
||||
&["image.embedding".to_string()][..],
|
||||
crate::index::IndexType::IvfPq,
|
||||
),
|
||||
(
|
||||
"literal_dot_idx",
|
||||
&["literal.`a.b`".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"metadata_user_id_idx",
|
||||
&["metadata.user_id".to_string()][..],
|
||||
crate::index::IndexType::BTree,
|
||||
),
|
||||
(
|
||||
"payload_text_idx",
|
||||
&["payload.text".to_string()][..],
|
||||
crate::index::IndexType::FTS,
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
let vector_results = table
|
||||
.query()
|
||||
.nearest_to(&[0.0; 8])
|
||||
.unwrap()
|
||||
.column("image.embedding")
|
||||
.limit(1)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
vector_results
|
||||
.iter()
|
||||
.map(|batch| batch.num_rows())
|
||||
.sum::<usize>(),
|
||||
1
|
||||
);
|
||||
|
||||
let fts_results = table
|
||||
.query()
|
||||
.full_text_search(FullTextSearchQuery::new("document".to_string()))
|
||||
.limit(5)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!fts_results.is_empty());
|
||||
|
||||
let filtered_results = table
|
||||
.query()
|
||||
.only_if("metadata.user_id = 42")
|
||||
.limit(1)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
filtered_results
|
||||
.iter()
|
||||
.map(|batch| batch.num_rows())
|
||||
.sum::<usize>(),
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_bitmap_index() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
|
||||
Reference in New Issue
Block a user