From 566b67a634021b43798d0a74f1fceaa1dea172c7 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 10 Jun 2026 23:53:56 +0800 Subject: [PATCH] fix: support LargeList label list indexes (#3529) ## Summary This PR extends nested-field regression coverage across Rust local/remote, Python sync/async, and Node so canonical escaped paths stay consistent across scalar, vector, and FTS index lifecycle behavior. It also aligns LanceDB's LabelList type gate with Lance by accepting `LargeList` columns while keeping `List<Struct>` unsupported until Lance defines stable membership semantics for struct labels. Part of #3406. --- nodejs/__test__/table.test.ts | 185 +++++++++++++++++++++++++-- python/python/tests/test_index.py | 99 ++++++++++++++- python/python/tests/test_table.py | 40 ++++++ rust/lancedb/src/remote/table.rs | 204 ++++++++++++++++++++++++++++++ rust/lancedb/src/table.rs | 126 +++++++++++++++++- rust/lancedb/src/utils/mod.rs | 4 +- 6 files changed, 638 insertions(+), 20 deletions(-) diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 0c9733c8f..b117d2c1d 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -911,10 +911,22 @@ describe("When creating an index", () => { expect(indices2.length).toBe(0); }); - it("should create and search a nested vector index", async () => { + it("should preserve canonical nested field paths across index lifecycle", async () => { const db = await connect(tmpDir.name); const nestedSchema = new Schema([ - new Field("id", new Int32(), true), + new Field("rowId", new Int32(), true), + new Field("row-id", new Int32(), true), + new Field("userId", new Int32(), true), + new Field( + "metadata", + new Struct([new Field("user_id", new Int32(), true)]), + true, + ), + new Field( + "MetaData", + new Struct([new Field("userId", new Int32(), true)]), + true, + ), new Field( "image", new Struct([ @@ -926,28 +938,147 @@ describe("When creating an index", () => { ]), true, ), + new Field( + "payload", 
new Struct([new Field("text", new Utf8(), true)]), + true, + ), + new Field( + "meta-data", + new Struct([new Field("user-id", new Int32(), true)]), + true, + ), + new Field( + "literal", + new Struct([new Field("a.b", new Int32(), true)]), + true, + ), ]); const nestedTable = await db.createTable( - "nested_vector", + "nested_field_index_lifecycle", makeArrowTable( - Array.from({ length: 300 }, (_, id) => ({ - id, - image: { embedding: [id, id + 1] }, + Array.from({ length: 300 }, (_, rowId) => ({ + rowId, + "row-id": rowId, + userId: rowId, + metadata: { ["user_id"]: rowId }, + ["MetaData"]: { userId: rowId }, + image: { embedding: [rowId, rowId + 1] }, + payload: { text: `document ${rowId}` }, + "meta-data": { "user-id": rowId }, + literal: { "a.b": rowId }, })), { schema: nestedSchema }, ), ); + await nestedTable.createIndex("rowId", { + config: Index.btree(), + name: "row_id_idx", + }); + await nestedTable.createIndex("`row-id`", { + config: Index.btree(), + name: "row_dash_id_idx", + }); + await nestedTable.createIndex("userId", { + config: Index.btree(), + name: "top_user_id_idx", + }); + await nestedTable.createIndex("metadata.user_id", { + config: Index.btree(), + name: "nested_user_id_idx", + }); + await nestedTable.createIndex("MetaData.userId", { + config: Index.btree(), + name: "mixed_case_metadata_user_id_idx", + }); + await nestedTable.createIndex("`meta-data`.`user-id`", { + config: Index.btree(), + name: "escaped_names_idx", + }); + await nestedTable.createIndex("literal.`a.b`", { + config: Index.btree(), + name: "literal_dot_idx", + }); await nestedTable.createIndex("image.embedding", { name: "image_embedding_idx", }); - const indices = await nestedTable.listIndices(); - expect(indices).toContainEqual({ - name: "image_embedding_idx", - indexType: "IvfPq", - columns: ["image.embedding"], + await nestedTable.createIndex("payload.text", { + config: Index.fts({ withPosition: false }), + name: "payload_text_idx", }); + const indices = await 
nestedTable.listIndices(); + expect(indices).toEqual( + expect.arrayContaining([ + { + name: "row_id_idx", + indexType: "BTree", + columns: ["rowId"], + }, + { + name: "row_dash_id_idx", + indexType: "BTree", + columns: ["`row-id`"], + }, + { + name: "top_user_id_idx", + indexType: "BTree", + columns: ["userId"], + }, + { + name: "nested_user_id_idx", + indexType: "BTree", + columns: ["metadata.user_id"], + }, + { + name: "mixed_case_metadata_user_id_idx", + indexType: "BTree", + columns: ["MetaData.userId"], + }, + { + name: "escaped_names_idx", + indexType: "BTree", + columns: ["`meta-data`.`user-id`"], + }, + { + name: "literal_dot_idx", + indexType: "BTree", + columns: ["literal.`a.b`"], + }, + { + name: "image_embedding_idx", + indexType: "IvfPq", + columns: ["image.embedding"], + }, + { + name: "payload_text_idx", + indexType: "FTS", + columns: ["payload.text"], + }, + ]), + ); + + const stats = await nestedTable.indexStats( + "mixed_case_metadata_user_id_idx", + ); + expect(stats?.numIndexedRows).toEqual(300); + expect(stats?.indexType).toEqual("BTREE"); + + const filtered = await nestedTable + .query() + .where("MetaData.userId = 42") + .limit(1) + .toArray(); + expect(filtered[0].MetaData.userId).toEqual(42); + + const escapedFiltered = await nestedTable + .query() + .where("`row-id` = 43") + .limit(1) + .toArray(); + expect(escapedFiltered[0]["row-id"]).toEqual(43); + const explicit = await nestedTable .query() .nearestTo([0.0, 1.0]) @@ -959,7 +1090,37 @@ describe("When creating an index", () => { .nearestTo([0.0, 1.0]) .limit(1) .toArray(); - expect(inferred[0].id).toEqual(explicit[0].id); + expect(inferred[0].rowId).toEqual(explicit[0].rowId); + + await nestedTable.add([ + { + rowId: 300, + "row-id": 300, + userId: 300, + metadata: { ["user_id"]: 300 }, + ["MetaData"]: { userId: 300 }, + image: { embedding: [300.0, 301.0] }, + payload: { text: "document 300" }, + "meta-data": { "user-id": 300 }, + literal: { "a.b": 300 }, + }, + ]); + await 
nestedTable.optimize(); + const indicesAfterOptimize = await nestedTable.listIndices(); + expect(indicesAfterOptimize).toEqual( + expect.arrayContaining([ + { + name: "mixed_case_metadata_user_id_idx", + indexType: "BTree", + columns: ["MetaData.userId"], + }, + { + name: "image_embedding_idx", + indexType: "IvfPq", + columns: ["image.embedding"], + }, + ]), + ); }); it("should report multiple nested vector candidates", async () => { diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index cf342eb5c..4e8e4633a 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -113,8 +113,14 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async): pa.field("user.id", pa.int32()), ] ) + mixed_case_metadata_type = pa.struct([pa.field("userId", pa.int32())]) + escaped_metadata_type = pa.struct([pa.field("user-id", pa.int32())]) + literal_type = pa.struct([pa.field("a.b", pa.int32())]) data = pa.Table.from_arrays( [ + pa.array([1, 2, 3], type=pa.int32()), + pa.array([1, 2, 3], type=pa.int32()), + pa.array([1, 2, 3], type=pa.int32()), pa.array([1, 2, 3], type=pa.int32()), pa.array( [ @@ -124,25 +130,67 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async): ], type=metadata_type, ), + pa.array( + [{"userId": 10}, {"userId": 20}, {"userId": 30}], + type=mixed_case_metadata_type, + ), + pa.array( + [{"user-id": 10}, {"user-id": 20}, {"user-id": 30}], + type=escaped_metadata_type, + ), + pa.array( + [{"a.b": 10}, {"a.b": 20}, {"a.b": 30}], + type=literal_type, + ), + ], + names=[ + "rowId", + "row-id", + "userId", + "user_id", + "metadata", + "MetaData", + "meta-data", + "literal", ], - names=["user_id", "metadata"], ) table = await db_async.create_table("nested_scalar_index", data) - await table.create_index("user_id", config=BTree(), name="top_user_id_idx") + await table.create_index("rowId", config=BTree(), name="row_id_idx") + await table.create_index("`row-id`", 
config=BTree(), name="row_dash_id_idx") + await table.create_index("userId", config=BTree(), name="top_user_id_idx") + await table.create_index("user_id", config=BTree(), name="top_snake_user_id_idx") await table.create_index( "metadata.user_id", config=BTree(), name="nested_user_id_idx" ) await table.create_index( "metadata.`user.id`", config=BTree(), name="escaped_user_id_idx" ) + await table.create_index( + "MetaData.userId", config=BTree(), name="mixed_case_metadata_user_id_idx" + ) + await table.create_index( + "`meta-data`.`user-id`", config=BTree(), name="escaped_names_idx" + ) + await table.create_index("literal.`a.b`", config=BTree(), name="literal_dot_idx") columns_by_name = { index.name: index.columns for index in await table.list_indices() } - assert columns_by_name["top_user_id_idx"] == ["user_id"] + assert columns_by_name["row_id_idx"] == ["rowId"] + assert columns_by_name["row_dash_id_idx"] == ["`row-id`"] + assert columns_by_name["top_user_id_idx"] == ["userId"] + assert columns_by_name["top_snake_user_id_idx"] == ["user_id"] assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"] assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"] + assert columns_by_name["mixed_case_metadata_user_id_idx"] == ["MetaData.userId"] + assert columns_by_name["escaped_names_idx"] == ["`meta-data`.`user-id`"] + assert columns_by_name["literal_dot_idx"] == ["literal.`a.b`"] + + for index_name in columns_by_name: + stats = await table.index_stats(index_name) + assert stats is not None + assert stats.num_indexed_rows == 3 @pytest.mark.asyncio @@ -189,6 +237,51 @@ async def test_create_label_list_index(some_table: AsyncTable): await some_table.create_index("tags", config=LabelList()) indices = await some_table.list_indices() assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]' + plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan() + assert "ScalarIndexQuery" in plan + + +@pytest.mark.asyncio 
+async def test_create_large_list_label_list_index(db_async): + data = pa.Table.from_pydict( + {"tags": [[f"tag{i % 2}", "shared"] for i in range(16)]}, + schema=pa.schema([pa.field("tags", pa.large_list(pa.string()))]), + ) + table = await db_async.create_table("large_list_label_list_index", data) + + await table.create_index("tags", config=LabelList()) + indices = await table.list_indices() + assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]' + plan = await table.query().where("array_has(tags, 'shared')").explain_plan() + assert "ScalarIndexQuery" in plan + + +@pytest.mark.asyncio +async def test_create_label_list_index_rejects_list_struct(db_async): + item_type = pa.struct( + [ + pa.field("tag", pa.string()), + pa.field( + "metadata", + pa.struct([pa.field("userId", pa.string())]), + ), + ] + ) + data = pa.Table.from_pylist( + [ + { + "items": [ + {"tag": "tag0", "metadata": {"userId": "user0"}}, + {"tag": "shared", "metadata": {"userId": "user1"}}, + ] + } + ], + schema=pa.schema([pa.field("items", pa.list_(item_type))]), + ) + table = await db_async.create_table("list_struct_label_list_index", data) + + with pytest.raises(Exception, match="LabelList index cannot be created"): + await table.create_index("items", config=LabelList()) @pytest.mark.asyncio diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 864730652..913220a1a 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -2399,18 +2399,32 @@ def test_create_scalar_index(mem_db: DBConnection): def test_create_index_nested_field_paths(mem_db: DBConnection): schema = pa.schema( [ + pa.field("rowId", pa.int32()), + pa.field("row-id", pa.int32()), + pa.field("userId", pa.int32()), pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])), + pa.field("MetaData", pa.struct([pa.field("userId", pa.int32())])), pa.field( "image", pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]), ), + 
pa.field("payload", pa.struct([pa.field("text", pa.string())])), + pa.field("meta-data", pa.struct([pa.field("user-id", pa.int32())])), + pa.field("literal", pa.struct([pa.field("a.b", pa.int32())])), ] ) data = pa.Table.from_pylist( [ { + "rowId": i, + "row-id": i, + "userId": i, "metadata": {"user_id": i}, + "MetaData": {"userId": i}, "image": {"embedding": [float(i), float(i + 1)]}, + "payload": {"text": f"document {i}"}, + "meta-data": {"user-id": i}, + "literal": {"a.b": i}, } for i in range(256) ], @@ -2418,19 +2432,37 @@ def test_create_index_nested_field_paths(mem_db: DBConnection): ) table = mem_db.create_table("nested_index_paths", data=data) + table.create_scalar_index("rowId", name="row_id_idx") + table.create_scalar_index("`row-id`", name="row_dash_id_idx") + table.create_scalar_index("userId", name="top_user_id_idx") table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx") + table.create_scalar_index("MetaData.userId", name="mixed_case_metadata_user_id_idx") + table.create_scalar_index("`meta-data`.`user-id`", name="escaped_names_idx") + table.create_scalar_index("literal.`a.b`", name="literal_dot_idx") table.create_index( vector_column_name="image.embedding", num_partitions=1, num_sub_vectors=1, name="image_embedding_idx", ) + table.create_fts_index("payload.text", with_position=False, name="payload_text_idx") indices = sorted(table.list_indices(), key=lambda idx: idx.name) assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [ + ("escaped_names_idx", "BTree", ["`meta-data`.`user-id`"]), ("image_embedding_idx", "IvfPq", ["image.embedding"]), + ("literal_dot_idx", "BTree", ["literal.`a.b`"]), ("metadata_user_id_idx", "BTree", ["metadata.user_id"]), + ("mixed_case_metadata_user_id_idx", "BTree", ["MetaData.userId"]), + ("payload_text_idx", "FTS", ["payload.text"]), + ("row_dash_id_idx", "BTree", ["`row-id`"]), + ("row_id_idx", "BTree", ["rowId"]), + ("top_user_id_idx", "BTree", ["userId"]), ] + for index in indices: 
+ stats = table.index_stats(index.name) + assert stats is not None + assert stats.num_indexed_rows == 256 vector_results = ( table.search([0.0, 1.0], vector_column_name="image.embedding") @@ -2448,6 +2480,14 @@ def test_create_index_nested_field_paths(mem_db: DBConnection): assert len(filtered_results) == 1 assert filtered_results[0]["metadata"]["user_id"] == 42 + escaped_results = table.search().where("`row-id` = 43").limit(1).to_list() + assert len(escaped_results) == 1 + assert escaped_results[0]["row-id"] == 43 + + fts_results = table.search("document 44", query_type="fts").limit(1).to_list() + assert len(fts_results) == 1 + assert fts_results[0]["payload"]["text"] == "document 44" + def test_empty_query(mem_db: DBConnection): table = mem_db.create_table( diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs index 1ccbcbb1d..9a38b9bb3 100644 --- a/rust/lancedb/src/remote/table.rs +++ b/rust/lancedb/src/remote/table.rs @@ -2616,11 +2616,19 @@ mod tests { let vector_type = DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 8); Schema::new(vec![ + Field::new("rowId", DataType::Int32, false), + Field::new("row-id", DataType::Int32, false), + Field::new("userId", DataType::Int32, false), Field::new( "metadata", DataType::Struct(vec![Field::new("user_id", DataType::Int32, false)].into()), false, ), + Field::new( + "MetaData", + DataType::Struct(vec![Field::new("userId", DataType::Int32, false)].into()), + false, + ), Field::new( "image", DataType::Struct(vec![Field::new("embedding", vector_type, false)].into()), @@ -3914,6 +3922,22 @@ mod tests { async fn test_create_index_nested_field_paths() { let schema = nested_index_schema(); let expected_requests = Arc::new(vec![ + json!({ + "column": "rowId", + "index_type": "BTREE", + }), + json!({ + "column": "`row-id`", + "index_type": "BTREE", + }), + json!({ + "column": "userId", + "index_type": "BTREE", + }), + json!({ + "column": "MetaData.userId", + "index_type": 
"BTREE", + }), json!({ "column": "metadata.user_id", "index_type": "BTREE", @@ -3969,6 +3993,26 @@ mod tests { } }); + table + .create_index(&["rowId"], Index::BTree(Default::default())) + .execute() + .await + .unwrap(); + table + .create_index(&["`ROW-ID`"], Index::BTree(Default::default())) + .execute() + .await + .unwrap(); + table + .create_index(&["userId"], Index::BTree(Default::default())) + .execute() + .await + .unwrap(); + table + .create_index(&["MetaData.userId"], Index::BTree(Default::default())) + .execute() + .await + .unwrap(); table .create_index(&["Metadata.USER_ID"], Index::BTree(Default::default())) .execute() @@ -4079,6 +4123,166 @@ mod tests { assert_eq!(indices, expected); } + #[tokio::test] + async fn test_list_indices_nested_field_paths() { + let schema = nested_index_schema(); + let table = Table::new_with_handler("my_table", move |request| { + assert_eq!(request.method(), "POST"); + + let response_body = match request.url().path() { + "/v1/table/my_table/describe/" => { + return http::Response::builder() + .status(200) + .body(describe_response(&schema)) + .unwrap(); + } + "/v1/table/my_table/index/list/" => { + serde_json::json!({ + "indexes": [ + { + "index_name": "row_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000001", + "columns": ["rowId"], + "index_status": "done", + }, + { + "index_name": "row_dash_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000002", + "columns": ["`ROW-ID`"], + "index_status": "done", + }, + { + "index_name": "user_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000003", + "columns": ["userId"], + "index_status": "done", + }, + { + "index_name": "mixed_case_metadata_user_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000004", + "columns": ["MetaData.userId"], + "index_status": "done", + }, + { + "index_name": "metadata_user_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000005", + "columns": ["Metadata.USER_ID"], + "index_status": "done", + }, + { + "index_name": 
"image_embedding_idx", + "index_uuid": "00000000-0000-0000-0000-000000000006", + "columns": ["Image.Embedding"], + "index_status": "done", + }, + { + "index_name": "payload_text_idx", + "index_uuid": "00000000-0000-0000-0000-000000000007", + "columns": ["Payload.Text"], + "index_status": "done", + }, + { + "index_name": "meta_data_user_id_idx", + "index_uuid": "00000000-0000-0000-0000-000000000008", + "columns": ["`META-DATA`.`USER-ID`"], + "index_status": "done", + }, + { + "index_name": "literal_dot_idx", + "index_uuid": "00000000-0000-0000-0000-000000000009", + "columns": ["literal.`A.B`"], + "index_status": "done", + }, + ] + }) + } + "/v1/table/my_table/index/row_id_idx/stats/" + | "/v1/table/my_table/index/row_dash_id_idx/stats/" + | "/v1/table/my_table/index/user_id_idx/stats/" + | "/v1/table/my_table/index/mixed_case_metadata_user_id_idx/stats/" + | "/v1/table/my_table/index/metadata_user_id_idx/stats/" + | "/v1/table/my_table/index/meta_data_user_id_idx/stats/" + | "/v1/table/my_table/index/literal_dot_idx/stats/" => { + serde_json::json!({ + "num_indexed_rows": 100000, + "num_unindexed_rows": 0, + "index_type": "BTREE" + }) + } + "/v1/table/my_table/index/image_embedding_idx/stats/" => { + serde_json::json!({ + "num_indexed_rows": 100000, + "num_unindexed_rows": 0, + "index_type": "IVF_PQ", + "distance_type": "l2" + }) + } + "/v1/table/my_table/index/payload_text_idx/stats/" => { + serde_json::json!({ + "num_indexed_rows": 100000, + "num_unindexed_rows": 0, + "index_type": "FTS" + }) + } + path => panic!("Unexpected path: {}", path), + }; + http::Response::builder() + .status(200) + .body(serde_json::to_string(&response_body).unwrap()) + .unwrap() + }); + + let indices = table.list_indices().await.unwrap(); + let expected = vec![ + IndexConfig { + name: "row_id_idx".into(), + index_type: IndexType::BTree, + columns: vec!["rowId".into()], + }, + IndexConfig { + name: "row_dash_id_idx".into(), + index_type: IndexType::BTree, + columns: 
vec!["`row-id`".into()], + }, + IndexConfig { + name: "user_id_idx".into(), + index_type: IndexType::BTree, + columns: vec!["userId".into()], + }, + IndexConfig { + name: "mixed_case_metadata_user_id_idx".into(), + index_type: IndexType::BTree, + columns: vec!["MetaData.userId".into()], + }, + IndexConfig { + name: "metadata_user_id_idx".into(), + index_type: IndexType::BTree, + columns: vec!["metadata.user_id".into()], + }, + IndexConfig { + name: "image_embedding_idx".into(), + index_type: IndexType::IvfPq, + columns: vec!["image.embedding".into()], + }, + IndexConfig { + name: "payload_text_idx".into(), + index_type: IndexType::FTS, + columns: vec!["payload.text".into()], + }, + IndexConfig { + name: "meta_data_user_id_idx".into(), + index_type: IndexType::BTree, + columns: vec!["`meta-data`.`user-id`".into()], + }, + IndexConfig { + name: "literal_dot_idx".into(), + index_type: IndexType::BTree, + columns: vec!["literal.`a.b`".into()], + }, + ]; + assert_eq!(indices, expected); + } + #[tokio::test] async fn test_list_versions() { let table = Table::new_with_handler("my_table", |request| { diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 5f7b5eff8..0f5763bd3 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -3339,7 +3339,7 @@ mod tests { use arrow_array::{ Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, LargeStringArray, RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, StructArray, - builder::{ListBuilder, StringBuilder}, + builder::{LargeListBuilder, ListBuilder, StringBuilder}, }; use arrow_array::{BinaryArray, LargeBinaryArray}; use arrow_data::ArrayDataBuilder; @@ -4312,11 +4312,20 @@ mod tests { let num_rows = 512; let dimension = 8; + let row_id = Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef; + let row_dash_id = Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef; + let top_user_id = Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef; 
+ let metadata = Arc::new(StructArray::from(vec![( Arc::new(Field::new("user_id", DataType::Int32, false)), Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, )])); + let mixed_case_metadata = Arc::new(StructArray::from(vec![( + Arc::new(Field::new("userId", DataType::Int32, false)), + Arc::new(Int32Array::from_iter_values(0..num_rows)) as ArrayRef, + )])); + let vector_values = arrow_array::Float32Array::from_iter_values( (0..num_rows * dimension).map(|v| v as f32), ); @@ -4349,15 +4358,31 @@ mod tests { )])); let schema = Arc::new(Schema::new(vec![ + Field::new("rowId", DataType::Int32, false), + Field::new("row-id", DataType::Int32, false), + Field::new("userId", DataType::Int32, false), Field::new("metadata", metadata.data_type().clone(), false), + Field::new("MetaData", mixed_case_metadata.data_type().clone(), false), Field::new("image", image.data_type().clone(), false), Field::new("payload", payload.data_type().clone(), false), Field::new("meta-data", meta_data.data_type().clone(), false), Field::new("literal", literal.data_type().clone(), false), ])); - let batch = - RecordBatch::try_new(schema, vec![metadata, image, payload, meta_data, literal]) - .unwrap(); + let batch = RecordBatch::try_new( + schema, + vec![ + row_id, + row_dash_id, + top_user_id, + metadata, + mixed_case_metadata, + image, + payload, + meta_data, + literal, + ], + ) + .unwrap(); let table = conn .create_table("nested_index_paths", batch) @@ -4374,6 +4399,33 @@ mod tests { .execute() .await .unwrap(); + table + .create_index(&["rowId"], Index::BTree(BTreeIndexBuilder::default())) + .name("row_id_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["`row-id`"], Index::BTree(BTreeIndexBuilder::default())) + .name("row_dash_id_idx".to_string()) + .execute() + .await + .unwrap(); + table + .create_index(&["userId"], Index::BTree(BTreeIndexBuilder::default())) + .name("top_user_id_idx".to_string()) + .execute() + .await + .unwrap(); + table + 
.create_index( + &["MetaData.userId"], + Index::BTree(BTreeIndexBuilder::default()), + ) + .name("mixed_case_metadata_user_id_idx".to_string()) + .execute() + .await + .unwrap(); table .create_index(&["image.embedding"], Index::Auto) .name("image_embedding_idx".to_string()) @@ -4441,11 +4493,31 @@ mod tests { &["metadata.user_id".to_string()][..], crate::index::IndexType::BTree, ), + ( + "mixed_case_metadata_user_id_idx", + &["MetaData.userId".to_string()][..], + crate::index::IndexType::BTree, + ), ( "payload_text_idx", &["payload.text".to_string()][..], crate::index::IndexType::FTS, ), + ( + "row_dash_id_idx", + &["`row-id`".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "row_id_idx", + &["rowId".to_string()][..], + crate::index::IndexType::BTree, + ), + ( + "top_user_id_idx", + &["userId".to_string()][..], + crate::index::IndexType::BTree, + ), ] ); @@ -4695,6 +4767,52 @@ mod tests { assert_eq!(index.columns, vec!["tags".to_string()]); } + #[tokio::test] + async fn test_create_label_list_index_on_large_list() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "tags", + DataType::LargeList(Field::new("item", DataType::Utf8, true).into()), + true, + )])); + + const TAGS: [&str; 3] = ["cat", "dog", "fish"]; + + let values_builder = StringBuilder::new(); + let mut builder = LargeListBuilder::new(values_builder); + for i in 0..120 { + builder.values().append_value(TAGS[i % 3]); + if i % 3 == 0 { + builder.append(true) + } + } + let tags = Arc::new(builder.finish()); + + let batch = RecordBatch::try_new(schema, vec![tags]).unwrap(); + + let table = conn + .create_table("test_large_list_label_list", batch) + .execute() + .await + .unwrap(); + + table + .create_index(&["tags"], Index::LabelList(Default::default())) + .execute() + .await + .unwrap(); + + let index_configs = 
table.list_indices().await.unwrap(); + assert_eq!(index_configs.len(), 1); + let index = index_configs.into_iter().next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::LabelList); + assert_eq!(index.columns, vec!["tags".to_string()]); + } + #[tokio::test] async fn test_create_inverted_index() { let tmp_dir = tempdir().unwrap(); diff --git a/rust/lancedb/src/utils/mod.rs b/rust/lancedb/src/utils/mod.rs index c0823bfd3..b435f4a0e 100644 --- a/rust/lancedb/src/utils/mod.rs +++ b/rust/lancedb/src/utils/mod.rs @@ -257,7 +257,9 @@ pub fn supported_bitmap_data_type(dtype: &DataType) -> bool { pub fn supported_label_list_data_type(dtype: &DataType) -> bool { match dtype { - DataType::List(field) => supported_bitmap_data_type(field.data_type()), + DataType::List(field) | DataType::LargeList(field) => { + supported_bitmap_data_type(field.data_type()) + } DataType::FixedSizeList(field, _) => supported_bitmap_data_type(field.data_type()), _ => false, }