fix: fix index and tag filtering for flat format (#7121)

* perf: only decode primary keys in the batch

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: don't push none to creator

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: implement method to filter __table_id for sparse encoding

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: filter table id for sparse encoding separately

The __table_id column is not present in the projection, so we have to
filter it manually.

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: decode tags for sparse encoding when building bloom filter

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support inverted index for tags under sparse encoding

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: skip tag columns in fulltext index

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix warnings

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix list index metadata test

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: decode primary key columns to filter

When primary key columns are not in the projection but appear in filters,
we need to decode them in compute_filter_mask_flat.

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: reuse filter method

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: only use dictionary for string type in compat

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: safe to get column by creator's column id

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
Yingwen
2025-10-23 14:43:46 +08:00
committed by GitHub
parent 136b9eef7a
commit f388dbdbb8
11 changed files with 482 additions and 251 deletions

View File

@@ -972,17 +972,17 @@ async fn test_list_ssts_with_format(
#[tokio::test]
async fn test_all_index_metas_list_all_types() {
test_all_index_metas_list_all_types_with_format(false, r#"
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 87, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 70, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":44,\"inverted_index_size\":70,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
test_all_index_metas_list_all_types_with_format(true, r#"
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 92, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":66,\"inverted_index_size\":92,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
}
async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expect_format: &str) {
@@ -1001,12 +1001,33 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
// One region with both fulltext backends and inverted index enabled, plus bloom skipping index
let region_id = RegionId::new(11, 1);
let mut request = CreateRequestBuilder::new().tag_num(3).field_num(2).build();
// inverted index on tag_0
request.column_metadatas[0]
let mut request = CreateRequestBuilder::new().tag_num(1).field_num(2).build();
// bloom filter skipping index on field_1
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
request.column_metadatas[1]
.column_schema
.set_skipping_options(&skipping)
.unwrap();
// inverted index on field_1
request.column_metadatas[2]
.column_schema
.set_inverted_index(true);
// fulltext bloom on tag_1
// inverted index on tag_0
request.column_metadatas[1]
.column_schema
.set_inverted_index(true);
request.column_metadatas.push(ColumnMetadata {
column_schema: ColumnSchema::new(
"field_2".to_string(),
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id: 4,
});
// fulltext bloom on field_2
let ft_bloom = FulltextOptions::new_unchecked(
true,
FulltextAnalyzer::English,
@@ -1015,11 +1036,24 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
4,
0.001,
);
request.column_metadatas[1]
request
.column_metadatas
.last_mut()
.unwrap()
.column_schema
.set_fulltext_options(&ft_bloom)
.unwrap();
// fulltext tantivy on tag_2
request.column_metadatas.push(ColumnMetadata {
column_schema: ColumnSchema::new(
"field_3".to_string(),
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id: 5,
});
// fulltext tantivy on field_3
let ft_tantivy = FulltextOptions::new_unchecked(
true,
FulltextAnalyzer::Chinese,
@@ -1028,28 +1062,20 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
2,
0.01,
);
request.column_metadatas[2]
request
.column_metadatas
.last_mut()
.unwrap()
.column_schema
.set_fulltext_options(&ft_tantivy)
.unwrap();
// bloom filter skipping index on field_1 (which is at index 3)
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
request.column_metadatas[3]
.column_schema
.set_skipping_options(&skipping)
.unwrap();
// inverted index on field_1
request.column_metadatas[4]
.column_schema
.set_inverted_index(true);
engine
.handle_request(region_id, RegionRequest::Create(request.clone()))
.await
.unwrap();
// write some rows (schema: tag_0, tag_1, tag_2, field_0, field_1, ts)
// write some rows (schema: tag_0, field_0, field_1, field_2, field_3, ts)
let column_schemas = rows_schema(&request);
let rows_vec: Vec<api::v1::Row> = (0..20)
.map(|ts| api::v1::Row {
@@ -1057,12 +1083,6 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("x".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::F64Value(ts as f64)),
},
@@ -1074,6 +1094,12 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
ts as i64 * 1000,
)),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
},
],
})
.collect();
@@ -1095,7 +1121,7 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
.unwrap();
fn bucket_size(size: u64) -> u64 {
if size < 512 { size } else { (size / 16) * 16 }
if size < 512 { size } else { (size / 100) * 100 }
}
let mut metas = engine.all_index_metas().await;
@@ -1125,5 +1151,5 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
.map(|entry| format!("\n{:?}", entry))
.collect::<String>();
assert_eq!(debug_format, expect_format);
assert_eq!(expect_format, debug_format);
}
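
The coarser bucketing in bucket_size above is what keeps the expected index_file_size values stable across platforms: sizes below 512 bytes compare exactly, larger sizes compare at 100-byte granularity. A quick sanity check of the helper's arithmetic (standalone sketch, values made up):

fn bucket_size(size: u64) -> u64 {
    if size < 512 { size } else { (size / 100) * 100 }
}

fn main() {
    // Small sizes compare exactly; larger ones round down to a multiple of 100.
    assert_eq!(bucket_size(300), 300);
    assert_eq!(bucket_size(6543), 6500);
}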

View File

@@ -13,12 +13,10 @@
// limitations under the License.
use std::collections::VecDeque;
use std::ops::BitAnd;
use std::sync::Arc;
use bytes::Bytes;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
@@ -30,7 +28,7 @@ use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
use crate::sst::parquet::flat_format::sequence_column_index;
use crate::sst::parquet::reader::{MaybeFilter, RowGroupReaderContext};
use crate::sst::parquet::reader::RowGroupReaderContext;
/// Iterator for reading data inside a bulk part.
pub struct EncodedBulkPartIter {
@@ -191,38 +189,13 @@ fn apply_combined_filters(
let num_rows = record_batch.num_rows();
let mut combined_filter = None;
// First, apply predicate filters.
// First, apply predicate filters using the shared method.
if !context.base.filters.is_empty() {
let num_rows = record_batch.num_rows();
let mut mask = BooleanBuffer::new_set(num_rows);
// Run filters one by one and combine their results, similar to RangeBase::precise_filter
for filter_ctx in &context.base.filters {
let filter = match filter_ctx.filter() {
MaybeFilter::Filter(f) => f,
// Column matches.
MaybeFilter::Matched => continue,
// Column doesn't match, filter the entire batch.
MaybeFilter::Pruned => return Ok(None),
};
// Safety: We checked the format type in new().
let Some(column_index) = context
.read_format()
.as_flat()
.unwrap()
.projected_index_by_id(filter_ctx.column_id())
else {
continue;
};
let array = record_batch.column(column_index);
let result = filter
.evaluate_array(array)
.context(crate::error::RecordBatchSnafu)?;
mask = mask.bitand(&result);
}
// Convert the mask to BooleanArray
let predicate_mask = context.base.compute_filter_mask_flat(&record_batch)?;
// If predicate filters out the entire batch, return None early
let Some(mask) = predicate_mask else {
return Ok(None);
};
combined_filter = Some(BooleanArray::from(mask));
}
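
For reference, the masking pattern that compute_filter_mask_flat now owns can be distilled into a few lines. This is a hedged sketch with a made-up combine_masks helper using only arrow-rs buffer types; the real method also resolves projected columns and decodes tags from the primary key:

use std::ops::BitAnd;

use arrow::buffer::BooleanBuffer;

// Starts from an all-true mask and ANDs in each predicate's result.
// A fully pruned batch is reported as None, which callers map to Ok(None).
fn combine_masks(num_rows: usize, results: &[BooleanBuffer]) -> Option<BooleanBuffer> {
    let mut mask = BooleanBuffer::new_set(num_rows);
    for result in results {
        mask = mask.bitand(result);
        if mask.count_set_bits() == 0 {
            return None;
        }
    }
    Some(mask)
}

fn main() {
    let all_true = BooleanBuffer::new_set(4);
    let mask = combine_masks(4, &[all_true]).expect("nothing pruned");
    assert_eq!(mask.count_set_bits(), 4);
}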

View File

@@ -386,7 +386,8 @@ impl FlatCompatBatch {
/// Repeats the vector value `to_len` times.
fn repeat_vector(vector: &VectorRef, to_len: usize, is_tag: bool) -> Result<ArrayRef> {
assert_eq!(1, vector.len());
if is_tag {
let data_type = vector.data_type();
if is_tag && data_type.is_string() {
let values = vector.to_arrow_array();
if values.is_null(0) {
// Creates a dictionary array with `to_len` null keys.
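
The fix above restricts dictionary building to string tags. A minimal standalone sketch of the two repeat paths (repeat_scalar is a made-up name; mito2's repeat_vector works on VectorRef rather than ArrayRef):

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, DictionaryArray, Int64Array, StringArray, UInt32Array};
use arrow::compute::take;

// Repeats a single-element array `to_len` times. String values keep
// dictionary encoding (one shared value); other types are materialized
// by taking index 0 repeatedly.
fn repeat_scalar(values: ArrayRef, to_len: usize, use_dictionary: bool) -> ArrayRef {
    let keys = UInt32Array::from(vec![0u32; to_len]);
    if use_dictionary {
        Arc::new(DictionaryArray::new(keys, values))
    } else {
        take(&values, &keys, None).expect("index 0 is valid")
    }
}

fn main() {
    let s: ArrayRef = Arc::new(StringArray::from(vec!["tag"]));
    assert_eq!(repeat_scalar(s, 3, true).len(), 3);

    let n: ArrayRef = Arc::new(Int64Array::from(vec![42_i64]));
    assert_eq!(repeat_scalar(n, 3, false).len(), 3);
}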

View File

@@ -48,6 +48,8 @@ pub struct FlatProjectionMapper {
/// Ids of columns to project. It keeps ids in the same order as the `projection`
/// indices to build the mapper.
/// The mapper won't deduplicate the column ids.
///
/// Note that this doesn't contain the `__table_id` and `__tsid`.
column_ids: Vec<ColumnId>,
/// Ids and DataTypes of columns of the expected batch.
/// We can use this to check if the batch is compatible with the expected schema.

View File

@@ -26,10 +26,13 @@ use std::sync::Arc;
use bloom_filter::creator::BloomFilterIndexer;
use common_telemetry::{debug, info, warn};
use datatypes::arrow::array::BinaryArray;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::index::IndexValuesCodec;
use mito_codec::row_converter::CompositeValues;
use puffin_manager::SstPuffinManager;
use smallvec::{SmallVec, smallvec};
use snafu::ResultExt;
use snafu::{OptionExt, ResultExt};
use statistics::{ByteCount, RowCount};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, RegionId};
@@ -40,7 +43,7 @@ use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, Regio
use crate::cache::file_cache::{FileType, IndexKey};
use crate::cache::write_cache::{UploadTracker, WriteCacheRef};
use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig};
use crate::error::{BuildIndexAsyncSnafu, Error, Result};
use crate::error::{BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, Result};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::INDEX_CREATE_MEMORY_USAGE;
use crate::read::{Batch, BatchReader};
@@ -57,6 +60,8 @@ use crate::sst::index::fulltext_index::creator::FulltextIndexer;
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::inverted_index::creator::InvertedIndexer;
use crate::sst::parquet::SstInfo;
use crate::sst::parquet::flat_format::primary_key_column_index;
use crate::sst::parquet::format::PrimaryKeyArray;
pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index";
pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index";
@@ -698,6 +703,56 @@ impl IndexBuildScheduler {
}
}
/// Decodes primary keys from a flat format RecordBatch.
/// Returns a list of (decoded_pk_value, count) tuples where count is the number of consecutive occurrences.
pub(crate) fn decode_primary_keys_with_counts(
batch: &RecordBatch,
codec: &IndexValuesCodec,
) -> Result<Vec<(CompositeValues, usize)>> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array",
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array",
})?;
let keys = pk_dict_array.keys();
// Decodes primary keys and counts consecutive occurrences
let mut result: Vec<(CompositeValues, usize)> = Vec::new();
let mut prev_key: Option<u32> = None;
for i in 0..keys.len() {
let current_key = keys.value(i);
// Checks if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Safety: We already have a key in the result vector.
result.last_mut().unwrap().1 += 1;
continue;
}
// New key, decodes it.
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decoder().decode(pk_bytes).context(DecodeSnafu)?;
result.push((decoded_value, 1));
prev_key = Some(current_key);
}
Ok(result)
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
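
The run-length counting in decode_primary_keys_with_counts only merges consecutive repeats, which matches flat-format batches sorted by primary key; a key that reappears later starts a new run and is decoded again. A toy model with plain u32 keys standing in for encoded primary keys:

fn runs(keys: &[u32]) -> Vec<(u32, usize)> {
    let mut result: Vec<(u32, usize)> = Vec::new();
    for &key in keys {
        match result.last_mut() {
            // Same key as the previous row: extend the current run.
            Some((prev, count)) if *prev == key => *count += 1,
            // New run: this is where the real code decodes the key.
            _ => result.push((key, 1)),
        }
    }
    result
}

fn main() {
    // Key 7 appears in two separate runs, so it would be decoded twice.
    assert_eq!(runs(&[7, 7, 7, 2, 2, 7]), vec![(7, 3), (2, 2), (7, 1)]);
}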

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::schema::SkippingIndexType;
@@ -23,9 +24,10 @@ use datatypes::vectors::Helper;
use index::bloom_filter::creator::BloomFilterCreator;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
@@ -35,13 +37,13 @@ use crate::error::{
OperateAbortedIndexSnafu, PuffinAddBlobSnafu, PushBloomFilterValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_BLOOM_FILTER_INDEX, decode_primary_keys_with_counts};
/// The buffer size for the pipe used to send index data to the puffin blob.
const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192;
@@ -289,47 +291,81 @@ impl BloomFilterIndexer {
let n = batch.num_rows();
guard.inc_row_count(n);
let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;
for (col_id, creator) in &mut self.creators {
// Get the column name from metadata
if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
// Safety: `creators` are created from the metadata, so the column won't be None.
let column_meta = self.metadata.column_by_id(*col_id).unwrap();
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
// Find the column in the RecordBatch by name
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}
creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in the batch during building bloom filter index",
"Column {} not found in primary key during building bloom filter index",
column_name
);
// Push empty elements to maintain alignment
for _ in 0..n {
creator
.push_row_elems(None)
.await
.context(PushBloomFilterValueSnafu)?;
}
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};
let elems = value
.filter(|v| !v.is_null())
.map(|v| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut buf,
)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
creator
.push_n_row_elems(*count, elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building bloom filter index",
column_name
);
}
}
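
Once the primary keys are decoded, both index builders share the same two lookup paths. A simplified model of that dispatch (the real CompositeValues in mito_codec stores typed Values, not Strings):

use std::collections::HashMap;

type ColumnId = u32;

// Dense primary keys are positional (pk_index); sparse ones are keyed
// by column id.
enum CompositeValues {
    Dense(Vec<(ColumnId, String)>),
    Sparse(HashMap<ColumnId, String>),
}

fn tag_value(
    decoded: &CompositeValues,
    column_id: ColumnId,
    pk_index: usize,
) -> Option<&String> {
    match decoded {
        CompositeValues::Dense(dense) => dense.get(pk_index).map(|(_, v)| v),
        CompositeValues::Sparse(sparse) => sparse.get(&column_id),
    }
}

fn main() {
    let dense = CompositeValues::Dense(vec![(1, "web".to_string()), (2, "us-east".to_string())]);
    assert_eq!(tag_value(&dense, 2, 1), Some(&"us-east".to_string()));

    let sparse = CompositeValues::Sparse(HashMap::from([(2, "us-east".to_string())]));
    assert_eq!(tag_value(&sparse, 2, 0), Some(&"us-east".to_string()));
}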

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::warn;
use datatypes::arrow::array::{Array, LargeStringArray, StringArray};
use datatypes::arrow::datatypes::DataType;
@@ -69,6 +70,17 @@ impl FulltextIndexer {
let mut creators = HashMap::new();
for column in &metadata.column_metadatas {
// Tag columns don't support fulltext index now.
// If we need to support fulltext index for tag columns, we also need to parse
// the codec and handle sparse encoding for flat format specially.
if column.semantic_type == SemanticType::Tag {
common_telemetry::debug!(
"Skip creating fulltext index for tag column {}",
column.column_schema.name
);
continue;
}
let options = column
.column_schema
.fulltext_options()

View File

@@ -17,6 +17,7 @@ use std::num::NonZeroUsize;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::vectors::Helper;
@@ -26,9 +27,10 @@ use index::inverted_index::create::sort_create::SortIndexCreator;
use index::inverted_index::format::writer::InvertedIndexBlobWriter;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio::io::duplex;
@@ -39,13 +41,13 @@ use crate::error::{
PushIndexValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_INVERTED_INDEX;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_INVERTED_INDEX, decode_primary_keys_with_counts};
/// The minimum memory usage threshold for one column.
const MIN_MEMORY_USAGE_THRESHOLD_PER_COLUMN: usize = 1024 * 1024; // 1MB
@@ -78,9 +80,6 @@ pub struct InvertedIndexer {
/// Region metadata for column lookups.
metadata: RegionMetadataRef,
/// Cache for mapping indexed column positions to their indices in the RecordBatch.
/// Aligns with indexed_column_ids. Initialized lazily when first batch is processed.
column_index_cache: Option<Vec<Option<usize>>>,
}
impl InvertedIndexer {
@@ -130,7 +129,6 @@ impl InvertedIndexer {
memory_usage,
indexed_column_ids,
metadata: metadata.clone(),
column_index_cache: None,
}
}
@@ -170,29 +168,29 @@ impl InvertedIndexer {
}
async fn do_update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
// Initialize column index cache if not already done
if self.column_index_cache.is_none() {
self.initialize_column_index_cache(batch);
}
let mut guard = self.stats.record_update();
let n = batch.num_rows();
guard.inc_row_count(n);
guard.inc_row_count(batch.num_rows());
let column_indices = self.column_index_cache.as_ref().unwrap();
let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;
for ((col_id, target_key), &column_index) in
self.indexed_column_ids.iter().zip(column_indices.iter())
{
if let Some(index) = column_index {
let column_array = batch.column(index);
for (col_id, target_key) in &self.indexed_column_ids {
let Some(column_meta) = self.metadata.column_by_id(*col_id) else {
debug!(
"Column {} not found in the metadata during building inverted index",
col_id
);
continue;
};
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef using Helper
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
for row in 0..n {
for row in 0..batch.num_rows() {
self.value_buf.clear();
let value_ref = vector.get_ref(row);
@@ -214,6 +212,47 @@ impl InvertedIndexer {
.context(PushIndexValueSnafu)?;
}
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}
let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in primary key during building bloom filter index",
column_name
);
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};
let elem = value
.filter(|v| !v.is_null())
.map(|v| {
self.value_buf.clear();
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut self.value_buf,
)
.context(EncodeSnafu)?;
Ok(self.value_buf.as_slice())
})
.transpose()?;
self.index_creator
.push_with_name_n(target_key, elem, *count)
.await
.context(PushIndexValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building inverted index",
@@ -225,26 +264,6 @@ impl InvertedIndexer {
Ok(())
}
/// Initializes the column index cache by mapping indexed column ids to their positions in the RecordBatch.
fn initialize_column_index_cache(&mut self, batch: &RecordBatch) {
let mut column_indices = Vec::with_capacity(self.indexed_column_ids.len());
for (col_id, _) in &self.indexed_column_ids {
let column_index = if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
batch
.schema()
.column_with_name(column_name)
.map(|(index, _)| index)
} else {
None
};
column_indices.push(column_index);
}
self.column_index_cache = Some(column_indices);
}
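
The push_with_name_n call above exists because one decoded primary key covers a whole run of rows. A toy creator (all names hypothetical) showing the repetition-count contract, including pushing None to keep row alignment:

struct ToyCreator {
    rows: Vec<Option<Vec<u8>>>,
}

impl ToyCreator {
    // Pushes the same (possibly absent) value for `n` consecutive rows.
    fn push_n(&mut self, elem: Option<&[u8]>, n: usize) {
        for _ in 0..n {
            self.rows.push(elem.map(|e| e.to_vec()));
        }
    }
}

fn main() {
    let mut creator = ToyCreator { rows: Vec::new() };
    creator.push_n(Some(b"host-1".as_slice()), 3); // run of 3 identical tag values
    creator.push_n(None, 2); // null tag for 2 rows still occupies 2 rows
    assert_eq!(creator.rows.len(), 5);
}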
/// Finishes index creation and cleans up garbage.
/// Returns the number of rows and bytes written.
pub async fn finish(

View File

@@ -15,18 +15,20 @@
//! Structs and functions for reading ranges from a parquet file. A file range
//! is usually a row group in a parquet file.
use std::collections::HashMap;
use std::ops::BitAnd;
use std::sync::Arc;
use api::v1::{OpType, SemanticType};
use common_telemetry::error;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::array::{ArrayRef, BooleanArray};
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
use parquet::arrow::arrow_reader::RowSelection;
use snafu::{OptionExt, ResultExt};
use store_api::storage::TimeSeriesRowSelector;
use store_api::codec::PrimaryKeyEncoding;
use store_api::storage::{ColumnId, TimeSeriesRowSelector};
use crate::error::{
ComputeArrowSnafu, DataTypeMismatchSnafu, DecodeSnafu, DecodeStatsSnafu, RecordBatchSnafu,
@@ -37,11 +39,11 @@ use crate::read::compat::CompatBatch;
use crate::read::last_row::RowGroupLastRowCachedReader;
use crate::read::prune::{FlatPruneReader, PruneReader};
use crate::sst::file::FileHandle;
use crate::sst::parquet::flat_format::{DecodedPrimaryKeys, decode_primary_keys};
use crate::sst::parquet::format::ReadFormat;
use crate::sst::parquet::reader::{
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
};
/// A range of a parquet SST. Now it is a row group.
/// We can read different file ranges in parallel.
#[derive(Clone)]
@@ -357,7 +359,34 @@ impl RangeBase {
}
/// Filters the input RecordBatch by the pushed down predicate and returns RecordBatch.
///
/// It assumes all necessary tags are already decoded from the primary key.
pub(crate) fn precise_filter_flat(&self, input: RecordBatch) -> Result<Option<RecordBatch>> {
let mask = self.compute_filter_mask_flat(&input)?;
// If mask is None, the entire batch is filtered out
let Some(mask) = mask else {
return Ok(None);
};
let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;
if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
}
/// Computes the filter mask for the input RecordBatch based on pushed down predicates.
///
/// Returns `None` if the entire batch is filtered out, otherwise returns the boolean mask.
pub(crate) fn compute_filter_mask_flat(
&self,
input: &RecordBatch,
) -> Result<Option<BooleanBuffer>> {
let mut mask = BooleanBuffer::new_set(input.num_rows());
let flat_format = self
@@ -367,6 +396,11 @@ impl RangeBase {
reason: "Expected flat format for precise_filter_flat",
})?;
// Decodes primary keys once if we have any tag filters not in projection
let mut decoded_pks: Option<DecodedPrimaryKeys> = None;
// Cache decoded tag arrays by column id to avoid redundant decoding
let mut decoded_tag_cache: HashMap<ColumnId, ArrayRef> = HashMap::new();
// Run filters one by one and combine their results
for filter_ctx in &self.filters {
let filter = match filter_ctx.filter() {
@@ -383,20 +417,53 @@ impl RangeBase {
let column = &input.columns()[idx];
let result = filter.evaluate_array(column).context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
} else {
// Column not found in projection, continue
continue;
} else if filter_ctx.semantic_type() == SemanticType::Tag {
// Column not found in projection, it may be a tag column.
// Decodes primary keys if not already decoded.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys(self.codec.as_ref(), input)?);
}
let metadata = flat_format.metadata();
let column_id = filter_ctx.column_id();
// Check cache first
let tag_column = if let Some(cached_column) = decoded_tag_cache.get(&column_id) {
cached_column.clone()
} else {
// For dense encoding, we need pk_index. For sparse encoding, pk_index is None.
let pk_index = if self.codec.encoding() == PrimaryKeyEncoding::Sparse {
None
} else {
metadata.primary_key_index(column_id)
};
let column_index = metadata.column_index_by_id(column_id);
if let (Some(column_index), Some(decoded)) =
(column_index, decoded_pks.as_ref())
{
let column_metadata = &metadata.column_metadatas[column_index];
let tag_column = decoded.get_tag_column(
column_id,
pk_index,
&column_metadata.column_schema.data_type,
)?;
// Cache the decoded tag column
decoded_tag_cache.insert(column_id, tag_column.clone());
tag_column
} else {
continue;
}
};
let result = filter
.evaluate_array(&tag_column)
.context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
}
// Non-tag column not found in projection.
}
let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;
if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
Ok(Some(mask))
}
}
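
The decoded_tag_cache above matters when several predicates reference the same tag column: the primary key is decoded into an array at most once per batch. A toy version of that memoization (cached_tag and the closure are made up; the real cache stores ArrayRefs, which are cheap to clone):

use std::collections::HashMap;

fn cached_tag<F>(
    cache: &mut HashMap<u32, Vec<String>>,
    column_id: u32,
    decode: F,
) -> Vec<String>
where
    F: FnOnce() -> Vec<String>,
{
    // Only runs `decode` the first time this column id is seen.
    cache.entry(column_id).or_insert_with(decode).clone()
}

fn main() {
    let mut cache = HashMap::new();
    let mut decodes = 0;
    for _filter in 0..3 {
        let _tags = cached_tag(&mut cache, 42, || {
            decodes += 1;
            vec!["a".to_string(), "b".to_string()]
        });
    }
    assert_eq!(decodes, 1); // decoded once, served from the cache afterwards
}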

View File

@@ -127,7 +127,9 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize {
num_columns - 1
}
// TODO(yingwen): Add an option to skip reading internal columns.
// TODO(yingwen): Add an option to skip reading internal columns if the region is
// append-only and doesn't use sparse encoding (we need to check the table id under
// sparse encoding).
/// Helper for reading the flat SST format with projection.
///
/// It only supports flat format that stores primary keys additionally.
@@ -528,6 +530,125 @@ pub(crate) fn sst_column_id_indices(metadata: &RegionMetadata) -> HashMap<Column
id_to_index
}
/// Decodes primary keys from a batch and returns decoded primary key information.
///
/// The batch must contain a primary key column at the expected index.
pub(crate) fn decode_primary_keys(
codec: &dyn PrimaryKeyCodec,
batch: &RecordBatch,
) -> Result<DecodedPrimaryKeys> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array".to_string(),
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array".to_string(),
})?;
let keys = pk_dict_array.keys();
// Decodes primary key values by iterating through keys, reusing decoded values for consecutive duplicate keys.
// Maps original key index -> new decoded value index
let mut key_to_decoded_index = Vec::with_capacity(keys.len());
let mut decoded_pk_values = Vec::new();
let mut prev_key: Option<u32> = None;
// The parquet reader may read the whole dictionary page into the dictionary values, so
// we may decode many primary keys not in this batch if we decode the values array directly.
for i in 0..keys.len() {
let current_key = keys.value(i);
// Check if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Reuse the last decoded index
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
continue;
}
// New key, decodes the value
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decode(pk_bytes).context(DecodeSnafu)?;
decoded_pk_values.push(decoded_value);
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
prev_key = Some(current_key);
}
// Create the keys array from key_to_decoded_index
let keys_array = UInt32Array::from(key_to_decoded_index);
Ok(DecodedPrimaryKeys {
decoded_pk_values,
keys_array,
})
}
/// Holds decoded primary key values and their indices.
pub(crate) struct DecodedPrimaryKeys {
/// Decoded primary key values for unique keys in the dictionary.
decoded_pk_values: Vec<CompositeValues>,
/// Prebuilt keys array for creating dictionary arrays.
keys_array: UInt32Array,
}
impl DecodedPrimaryKeys {
/// Gets a tag column array by column id and data type.
///
/// For sparse encoding, uses column_id to lookup values.
/// For dense encoding, uses pk_index to get values.
pub(crate) fn get_tag_column(
&self,
column_id: ColumnId,
pk_index: Option<usize>,
column_type: &ConcreteDataType,
) -> Result<ArrayRef> {
// Gets values from the primary key.
let mut builder = column_type.create_mutable_vector(self.decoded_pk_values.len());
for decoded in &self.decoded_pk_values {
match decoded {
CompositeValues::Dense(dense) => {
let pk_idx = pk_index.expect("pk_index required for dense encoding");
if pk_idx < dense.len() {
builder.push_value_ref(&dense[pk_idx].1.as_value_ref());
} else {
builder.push_null();
}
}
CompositeValues::Sparse(sparse) => {
let value = sparse.get_or_null(column_id);
builder.push_value_ref(&value.as_value_ref());
}
};
}
let values_vector = builder.to_vector();
let values_array = values_vector.to_arrow_array();
// Only creates dictionary array for string types, otherwise take values by keys
if column_type.is_string() {
// Creates dictionary array using the same keys for string types
// Note that the dictionary values may have nulls.
let dict_array = DictionaryArray::new(self.keys_array.clone(), values_array);
Ok(Arc::new(dict_array))
} else {
// For non-string types, takes values by keys indices to create a regular array
let taken_array =
take(&values_array, &self.keys_array, None).context(ComputeArrowSnafu)?;
Ok(taken_array)
}
}
}
/// Converts a batch that doesn't have decoded primary key columns into a batch that has decoded
/// primary key columns in flat format.
pub(crate) struct FlatConvertFormat {
@@ -577,53 +698,22 @@ impl FlatConvertFormat {
/// Converts a batch to have decoded primary key columns in flat format.
///
/// The primary key array in the batch is a dictionary array. We decode each value which is a
/// primary key and reuse the keys array to build a dictionary array for each tag column.
/// The decoded columns are inserted in front of other columns.
/// The primary key array in the batch is a dictionary array.
pub(crate) fn convert(&self, batch: RecordBatch) -> Result<RecordBatch> {
if self.projected_primary_keys.is_empty() {
return Ok(batch);
}
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array".to_string(),
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array".to_string(),
})?;
// Decodes all primary key values
let mut decoded_pk_values = Vec::with_capacity(pk_values_array.len());
for i in 0..pk_values_array.len() {
if pk_values_array.is_null(i) {
decoded_pk_values.push(None);
} else {
let pk_bytes = pk_values_array.value(i);
let decoded = self.codec.decode(pk_bytes).context(DecodeSnafu)?;
decoded_pk_values.push(Some(decoded));
}
}
let decoded_pks = decode_primary_keys(self.codec.as_ref(), &batch)?;
// Builds decoded tag column arrays.
let mut decoded_columns = Vec::new();
for (column_id, pk_index, column_index) in &self.projected_primary_keys {
let column_metadata = &self.metadata.column_metadatas[*column_index];
let tag_column = self.build_primary_key_column(
let tag_column = decoded_pks.get_tag_column(
*column_id,
*pk_index,
Some(*pk_index),
&column_metadata.column_schema.data_type,
pk_dict_array.keys(),
&decoded_pk_values,
)?;
decoded_columns.push(tag_column);
}
@@ -648,57 +738,6 @@ impl FlatConvertFormat {
let new_schema = Arc::new(Schema::new(new_fields));
RecordBatch::try_new(new_schema, new_columns).context(NewRecordBatchSnafu)
}
/// Builds an array for a specific tag column.
///
/// It may build a dictionary array if the type is string. Note that the dictionary
/// array may have null values, although keys are not null.
fn build_primary_key_column(
&self,
column_id: ColumnId,
pk_index: usize,
column_type: &ConcreteDataType,
keys: &UInt32Array,
decoded_pk_values: &[Option<CompositeValues>],
) -> Result<ArrayRef> {
// Gets values from the primary key.
let mut builder = column_type.create_mutable_vector(decoded_pk_values.len());
for decoded_opt in decoded_pk_values {
match decoded_opt {
Some(decoded) => {
match decoded {
CompositeValues::Dense(dense) => {
if pk_index < dense.len() {
builder.push_value_ref(&dense[pk_index].1.as_value_ref());
} else {
builder.push_null();
}
}
CompositeValues::Sparse(sparse) => {
let value = sparse.get_or_null(column_id);
builder.push_value_ref(&value.as_value_ref());
}
};
}
None => builder.push_null(),
}
}
let values_vector = builder.to_vector();
let values_array = values_vector.to_arrow_array();
// Only creates dictionary array for string types, otherwise take values by keys
if column_type.is_string() {
// Creates dictionary array using the same keys for string types
// Note that the dictionary values may have nulls.
let dict_array = DictionaryArray::new(keys.clone(), values_array);
Ok(Arc::new(dict_array))
} else {
// For non-string types, takes values by keys indices to create a regular array
let taken_array = take(&values_array, keys, None).context(ComputeArrowSnafu)?;
Ok(taken_array)
}
}
}
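
The key remapping in decode_primary_keys deserves a closer look: because the parquet reader may load the whole dictionary page, decoding pk_values_array directly could decode primary keys this batch never references. A plain-Rust toy of the remapping (remap is a made-up name; strings stand in for decoded CompositeValues):

fn remap(keys: &[u32], dict: &[&str]) -> (Vec<String>, Vec<u32>) {
    let mut decoded = Vec::new();
    let mut new_keys = Vec::with_capacity(keys.len());
    let mut prev: Option<u32> = None;
    for &key in keys {
        // Decode only on a key change; consecutive repeats reuse the
        // last decoded value.
        if prev != Some(key) {
            decoded.push(dict[key as usize].to_string());
            prev = Some(key);
        }
        new_keys.push((decoded.len() - 1) as u32);
    }
    (decoded, new_keys)
}

fn main() {
    // The dictionary holds 4 values but the batch references only 2.
    let (decoded, new_keys) = remap(&[1, 1, 3], &["a", "b", "c", "d"]);
    assert_eq!(decoded, vec!["b".to_string(), "d".to_string()]);
    assert_eq!(new_keys, vec![0, 0, 1]);
}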
#[cfg(test)]

View File

@@ -1397,6 +1397,7 @@ impl FlatRowGroupReader {
let record_batch = batch_result.context(ArrowReaderSnafu {
path: self.context.file_path(),
})?;
// Safety: Only the flat format uses FlatRowGroupReader.
let flat_format = self.context.read_format().as_flat().unwrap();
let record_batch =