1use std::collections::HashMap;
18use std::sync::Arc;
19
20use api::v1::SemanticType;
21use arrow_schema::DataType;
22use common_base::readable_size::ReadableSize;
23use datatypes::arrow::datatypes::{
24 DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef,
25};
26use datatypes::arrow::record_batch::RecordBatch;
27use datatypes::prelude::ConcreteDataType;
28use datatypes::timestamp::timestamp_array_to_primitive;
29use serde::{Deserialize, Serialize};
30use store_api::codec::PrimaryKeyEncoding;
31use store_api::metadata::RegionMetadata;
32use store_api::storage::consts::{
33 OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
34};
35
36use crate::sst::parquet::flat_format::time_index_column_index;
37
// Submodules declared by this module. All of them are public except
// `version`, which is only visible inside the crate.
pub mod file;
pub mod file_purger;
pub mod file_ref;
pub mod index;
pub mod location;
pub mod parquet;
pub(crate) mod version;
45
/// Default size of the write buffer, built with `ReadableSize::mb(8)`.
pub const DEFAULT_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(8);

/// Default write concurrency (8).
// NOTE(review): the consumers of both defaults live outside this file.
pub const DEFAULT_WRITE_CONCURRENCY: usize = 8;
51
/// Selects a format type.
///
/// The variant names are (de)serialized by serde and parsed by strum in
/// snake case, i.e. `primary_key` and `flat`. The default is
/// [`FormatType::PrimaryKey`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, strum::EnumString)]
#[serde(rename_all = "snake_case")]
#[strum(serialize_all = "snake_case")]
pub enum FormatType {
    /// Default variant.
    // NOTE(review): presumably the layout built by `to_sst_arrow_schema` — confirm.
    #[default]
    PrimaryKey,
    /// Flat variant.
    // NOTE(review): presumably the layout built by `to_flat_sst_arrow_schema` — confirm.
    Flat,
}
63
/// Metadata key under which a column id is stored on an arrow [`Field`].
pub const PARQUET_FIELD_ID_KEY: &str = "PARQUET:field_id";
66
67pub fn with_field_id(mut field: Field, column_id: u32) -> Field {
69 field
70 .metadata_mut()
71 .insert(PARQUET_FIELD_ID_KEY.to_string(), column_id.to_string());
72 field
73}
74
75pub fn to_sst_arrow_schema(metadata: &RegionMetadata) -> SchemaRef {
77 let fields = Fields::from_iter(
78 metadata
79 .schema
80 .arrow_schema()
81 .fields()
82 .iter()
83 .zip(&metadata.column_metadatas)
84 .filter_map(|(field, column_meta)| {
85 if column_meta.semantic_type == SemanticType::Field {
86 Some(Arc::new(with_field_id(
87 (**field).clone(),
88 column_meta.column_id,
89 )))
90 } else {
91 None
93 }
94 })
95 .chain([Arc::new(with_field_id(
96 (*metadata.time_index_field()).clone(),
97 metadata.time_index_column().column_id,
98 ))])
99 .chain(internal_fields()),
100 );
101
102 Arc::new(Schema::new(fields))
103}
104
/// Options that control how `to_flat_sst_arrow_schema` lays out a schema.
pub struct FlatSchemaOptions {
    /// When true, the schema contains one field per primary key column in
    /// addition to the internal fields.
    pub raw_pk_columns: bool,
    /// When true, primary key columns go through
    /// `tag_maybe_to_dictionary_field`, which dictionary-encodes string
    /// columns. Only consulted when `raw_pk_columns` is true.
    pub string_pk_use_dict: bool,
    /// Maps a field name to the arrow data type that replaces the field's
    /// own data type in the schema.
    // NOTE(review): presumably only populated for JSON columns — confirm with callers.
    pub concretized_json_types: HashMap<String, DataType>,
}
117
118impl Default for FlatSchemaOptions {
119 fn default() -> Self {
120 Self {
121 raw_pk_columns: true,
122 string_pk_use_dict: true,
123 concretized_json_types: HashMap::new(),
124 }
125 }
126}
127
128impl FlatSchemaOptions {
129 pub fn from_encoding(encoding: PrimaryKeyEncoding) -> Self {
131 if encoding == PrimaryKeyEncoding::Dense {
132 Self::default()
133 } else {
134 Self {
135 raw_pk_columns: false,
136 string_pk_use_dict: false,
137 concretized_json_types: HashMap::new(),
138 }
139 }
140 }
141}
142
143pub fn to_flat_sst_arrow_schema(
153 metadata: &RegionMetadata,
154 options: &FlatSchemaOptions,
155) -> SchemaRef {
156 let num_fields = flat_sst_arrow_schema_column_num(metadata, options);
157 let mut fields = Vec::with_capacity(num_fields);
158 let schema = metadata.schema.arrow_schema();
159 if options.raw_pk_columns {
160 for pk_id in &metadata.primary_key {
161 let pk_index = metadata.column_index_by_id(*pk_id).unwrap();
162 let column_id = metadata.column_metadatas[pk_index].column_id;
163 if options.string_pk_use_dict {
164 let old_field = &schema.fields[pk_index];
165 let new_field = tag_maybe_to_dictionary_field(
166 &metadata.column_metadatas[pk_index].column_schema.data_type,
167 old_field,
168 );
169 let new_field = concretize_json_type(new_field, options);
170 fields.push(Arc::new(with_field_id((*new_field).clone(), column_id)));
171 }
172 }
173 }
174 let remaining_fields = schema
175 .fields()
176 .iter()
177 .zip(&metadata.column_metadatas)
178 .filter_map(|(field, column_meta)| {
179 if column_meta.semantic_type == SemanticType::Field {
180 let field = concretize_json_type(field.clone(), options);
181 Some(Arc::new(with_field_id(
182 Arc::unwrap_or_clone(field),
183 column_meta.column_id,
184 )))
185 } else {
186 None
187 }
188 })
189 .chain([Arc::new(with_field_id(
190 (*metadata.time_index_field()).clone(),
191 metadata.time_index_column().column_id,
192 ))])
193 .chain(internal_fields());
194 for field in remaining_fields {
195 fields.push(field);
196 }
197
198 Arc::new(Schema::new(fields))
199}
200
201fn concretize_json_type(field: Arc<Field>, options: &FlatSchemaOptions) -> Arc<Field> {
202 if let Some(data_type) = options.concretized_json_types.get(field.name()) {
203 let mut field = Arc::unwrap_or_clone(field);
204 field.set_data_type(data_type.clone());
205 Arc::new(field)
206 } else {
207 field
208 }
209}
210
211pub fn flat_sst_arrow_schema_column_num(
213 metadata: &RegionMetadata,
214 options: &FlatSchemaOptions,
215) -> usize {
216 if options.raw_pk_columns {
217 metadata.column_metadatas.len() + 3
218 } else {
219 metadata.column_metadatas.len() + 3 - metadata.primary_key.len()
220 }
221}
222
223fn to_dictionary_field(field: &Field) -> Field {
225 let mut new_field = Field::new_dictionary(
226 field.name(),
227 datatypes::arrow::datatypes::DataType::UInt32,
228 field.data_type().clone(),
229 field.is_nullable(),
230 );
231
232 if let Some(field_id) = field.metadata().get(PARQUET_FIELD_ID_KEY) {
234 new_field
235 .metadata_mut()
236 .insert(PARQUET_FIELD_ID_KEY.to_string(), field_id.clone());
237 }
238
239 new_field
240}
241
242pub(crate) fn tag_maybe_to_dictionary_field(
244 data_type: &ConcreteDataType,
245 field: &Arc<Field>,
246) -> Arc<Field> {
247 if data_type.is_string() {
248 Arc::new(to_dictionary_field(field))
249 } else {
250 field.clone()
251 }
252}
253
254pub(crate) fn internal_fields() -> [FieldRef; 3] {
256 [
258 Arc::new(Field::new_dictionary(
259 PRIMARY_KEY_COLUMN_NAME,
260 ArrowDataType::UInt32,
261 ArrowDataType::Binary,
262 false,
263 )),
264 Arc::new(Field::new(
265 SEQUENCE_COLUMN_NAME,
266 ArrowDataType::UInt64,
267 false,
268 )),
269 Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)),
270 ]
271}
272
/// Counts series across a stream of record batches by looking at the time
/// index column: a timestamp that is not greater than the one before it is
/// counted as the start of a new series.
// NOTE(review): assumes the rows of one series arrive with strictly
// increasing timestamps — confirm with callers.
#[derive(Default)]
pub(crate) struct SeriesEstimator {
    /// Timestamp of the last row seen so far, `None` before the first
    /// non-empty batch and after `finish`.
    last_timestamp: Option<i64>,
    /// Number of series counted so far.
    series_count: u64,
}
284
285impl SeriesEstimator {
286 pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) {
290 let batch_rows = record_batch.num_rows();
291 if batch_rows == 0 {
292 return;
293 }
294
295 let time_index_pos = time_index_column_index(record_batch.num_columns());
296 let timestamps = record_batch.column(time_index_pos);
297 let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else {
298 return;
299 };
300 let values = ts_values.values();
301
302 if let Some(last_ts) = self.last_timestamp {
304 if values[0] <= last_ts {
305 self.series_count += 1;
306 }
307 } else {
308 self.series_count = 1;
310 }
311
312 for i in 0..batch_rows - 1 {
314 if values[i] >= values[i + 1] {
317 self.series_count += 1;
318 }
319 }
320
321 self.last_timestamp = Some(values[batch_rows - 1]);
323 }
324
325 pub(crate) fn finish(&mut self) -> u64 {
327 self.last_timestamp = None;
328 let count = self.series_count;
329 self.series_count = 0;
330
331 count
332 }
333}
334
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datatypes::arrow::array::{
        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
        UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
    use datatypes::arrow::record_batch::RecordBatch;

    use super::*;

    /// Builds a record batch with four columns (`time`, `__primary_key`,
    /// `__sequence`, `__op_type`) holding one row per entry of `timestamps`.
    /// Every row uses the same primary key, sequence and op type.
    fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
        // With four columns the time index is expected at position 0, which
        // is where the `time` column is placed below.
        let num_cols = 4;
        let time_index_pos = time_index_column_index(num_cols);
        assert_eq!(time_index_pos, 0);
        let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
        let pk_array = Arc::new(DictionaryArray::new(
            UInt32Array::from(vec![0; timestamps.len()]),
            Arc::new(BinaryArray::from(vec![b"test".as_slice()])),
        ));
        let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()]));
        let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()]));

        let schema = Arc::new(Schema::new(vec![
            Field::new(
                "time",
                ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
                false,
            ),
            Field::new_dictionary(
                "__primary_key",
                ArrowDataType::UInt32,
                ArrowDataType::Binary,
                false,
            ),
            Field::new("__sequence", ArrowDataType::UInt64, false),
            Field::new("__op_type", ArrowDataType::UInt8, false),
        ]));

        RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
    }

    /// An empty batch is ignored, so no series is counted.
    #[test]
    fn test_series_estimator_flat_empty_batch() {
        let mut estimator = SeriesEstimator::default();
        let record_batch = new_flat_record_batch(&[]);
        estimator.update_flat(&record_batch);
        assert_eq!(0, estimator.finish());
    }

    /// Strictly increasing timestamps form a single series.
    #[test]
    fn test_series_estimator_flat_single_batch() {
        let mut estimator = SeriesEstimator::default();
        let record_batch = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&record_batch);
        assert_eq!(1, estimator.finish());
    }

    /// One decrease inside the batch splits it into two series.
    #[test]
    fn test_series_estimator_flat_series_boundary_within_batch() {
        let mut estimator = SeriesEstimator::default();
        // Boundary: 3 -> 2.
        let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]);
        estimator.update_flat(&record_batch);
        assert_eq!(2, estimator.finish());
    }

    /// Two decreases inside the batch give three series.
    #[test]
    fn test_series_estimator_flat_multiple_boundaries_within_batch() {
        let mut estimator = SeriesEstimator::default();
        // Boundaries: 5 -> 4 and 6 -> 3.
        let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]);
        estimator.update_flat(&record_batch);
        assert_eq!(3, estimator.finish());
    }

    /// Equal adjacent timestamps are counted as series boundaries.
    #[test]
    fn test_series_estimator_flat_equal_timestamps() {
        let mut estimator = SeriesEstimator::default();
        // Boundaries: 2 -> 2, 3 -> 3 and 3 -> 3, on top of the first series.
        let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]);
        estimator.update_flat(&record_batch);
        assert_eq!(4, estimator.finish());
    }

    /// A second batch that keeps increasing continues the same series.
    #[test]
    fn test_series_estimator_flat_multiple_batches_continuation() {
        let mut estimator = SeriesEstimator::default();

        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        // 4 is greater than the previous last timestamp 3: no boundary.
        let batch2 = new_flat_record_batch(&[4, 5, 6]);
        estimator.update_flat(&batch2);

        assert_eq!(1, estimator.finish());
    }

    /// A second batch starting below the previous last timestamp opens a new series.
    #[test]
    fn test_series_estimator_flat_multiple_batches_new_series() {
        let mut estimator = SeriesEstimator::default();

        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        // 2 is not greater than the previous last timestamp 3: boundary.
        let batch2 = new_flat_record_batch(&[2, 3, 4]);
        estimator.update_flat(&batch2);

        assert_eq!(2, estimator.finish());
    }

    /// A second batch starting at the previous last timestamp opens a new series.
    #[test]
    fn test_series_estimator_flat_boundary_at_batch_edge_equal() {
        let mut estimator = SeriesEstimator::default();

        let batch1 = new_flat_record_batch(&[1, 2, 5]);
        estimator.update_flat(&batch1);

        // 5 equals the previous last timestamp 5: boundary.
        let batch2 = new_flat_record_batch(&[5, 6, 7]);
        estimator.update_flat(&batch2);

        assert_eq!(2, estimator.finish());
    }

    /// Boundaries across and inside batches are both counted.
    #[test]
    fn test_series_estimator_flat_mixed_batches() {
        let mut estimator = SeriesEstimator::default();

        // First series.
        let batch1 = new_flat_record_batch(&[10, 20, 30]);
        estimator.update_flat(&batch1);

        // Boundaries: 30 -> 5 (across batches) and 15 -> 10 (inside the batch).
        let batch2 = new_flat_record_batch(&[5, 15, 10, 25]);
        estimator.update_flat(&batch2);

        // 30 is greater than the previous last timestamp 25: no boundary.
        let batch3 = new_flat_record_batch(&[30, 35]);
        estimator.update_flat(&batch3);

        assert_eq!(3, estimator.finish());
    }

    /// Strictly decreasing timestamps make every row its own series.
    #[test]
    fn test_series_estimator_flat_descending_timestamps() {
        let mut estimator = SeriesEstimator::default();
        // Four decreases on top of the first series.
        let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]);
        estimator.update_flat(&record_batch);
        assert_eq!(5, estimator.finish());
    }

    /// `finish` clears the state, so the next batch starts counting from scratch.
    #[test]
    fn test_series_estimator_flat_finish_resets_state() {
        let mut estimator = SeriesEstimator::default();

        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        assert_eq!(1, estimator.finish());

        // The last timestamp was reset, so this batch opens the first series again.
        let batch2 = new_flat_record_batch(&[4, 5, 6]);
        estimator.update_flat(&batch2);

        assert_eq!(1, estimator.finish());
    }
}