Skip to main content

mito2/
sst.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Sorted strings tables.
16
17use std::collections::HashMap;
18use std::sync::Arc;
19
20use api::v1::SemanticType;
21use arrow_schema::DataType;
22use common_base::readable_size::ReadableSize;
23use datatypes::arrow::datatypes::{
24    DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef,
25};
26use datatypes::arrow::record_batch::RecordBatch;
27use datatypes::prelude::ConcreteDataType;
28use datatypes::timestamp::timestamp_array_to_primitive;
29use serde::{Deserialize, Serialize};
30use store_api::codec::PrimaryKeyEncoding;
31use store_api::metadata::RegionMetadata;
32use store_api::storage::consts::{
33    OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
34};
35
36use crate::sst::parquet::flat_format::time_index_column_index;
37
38pub mod file;
39pub mod file_purger;
40pub mod file_ref;
41pub mod index;
42pub mod location;
43pub mod parquet;
44pub(crate) mod version;
45
/// Default write buffer size. It should be greater than the default minimum
/// upload part size of S3 (5 MB).
pub const DEFAULT_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(8);

/// Default number of concurrent writes. It only takes effect on object store
/// backends (e.g., S3).
pub const DEFAULT_WRITE_CONCURRENCY: usize = 8;
51
/// Format type of the SST file.
///
/// Variant names are (de)serialized by serde and parsed by strum in snake case,
/// i.e. `primary_key` and `flat`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, strum::EnumString)]
#[serde(rename_all = "snake_case")]
#[strum(serialize_all = "snake_case")]
pub enum FormatType {
    /// Parquet with primary key encoded. This is the default format.
    #[default]
    PrimaryKey,
    /// Flat Parquet format.
    Flat,
}
63
/// Iceberg-compatible column field ID key stored in Parquet column metadata.
///
/// [`with_field_id`] stores the column ID under this key in the metadata of
/// an Arrow field.
pub const PARQUET_FIELD_ID_KEY: &str = "PARQUET:field_id";
66
67/// Adds `PARQUET:field_id` metadata to an Arrow field.
68pub fn with_field_id(mut field: Field, column_id: u32) -> Field {
69    field
70        .metadata_mut()
71        .insert(PARQUET_FIELD_ID_KEY.to_string(), column_id.to_string());
72    field
73}
74
75/// Gets the arrow schema to store in parquet.
76pub fn to_sst_arrow_schema(metadata: &RegionMetadata) -> SchemaRef {
77    let fields = Fields::from_iter(
78        metadata
79            .schema
80            .arrow_schema()
81            .fields()
82            .iter()
83            .zip(&metadata.column_metadatas)
84            .filter_map(|(field, column_meta)| {
85                if column_meta.semantic_type == SemanticType::Field {
86                    Some(Arc::new(with_field_id(
87                        (**field).clone(),
88                        column_meta.column_id,
89                    )))
90                } else {
91                    // We have fixed positions for tags (primary key) and time index.
92                    None
93                }
94            })
95            .chain([Arc::new(with_field_id(
96                (*metadata.time_index_field()).clone(),
97                metadata.time_index_column().column_id,
98            ))])
99            .chain(internal_fields()),
100    );
101
102    Arc::new(Schema::new(fields))
103}
104
/// Options of flat schema.
///
/// These options control how [`to_flat_sst_arrow_schema`] lays out the
/// primary key columns and which data types it assigns to JSON columns.
pub struct FlatSchemaOptions {
    /// Whether to store primary key columns additionally instead of an encoded column.
    /// When it is false, the schema contains no individual primary key columns.
    pub raw_pk_columns: bool,
    /// Whether to use dictionary encoding for string primary key columns
    /// when storing primary key columns.
    /// Only takes effect when `raw_pk_columns` is true.
    pub string_pk_use_dict: bool,
    /// The column's concretized JSON types, to be set into Arrow schema.
    /// Otherwise it's empty struct in the Arrow schema.
    /// The key is the column name and the value is the Arrow data type that
    /// replaces the data type of that column.
    pub concretized_json_types: HashMap<String, DataType>,
}
117
118impl Default for FlatSchemaOptions {
119    fn default() -> Self {
120        Self {
121            raw_pk_columns: true,
122            string_pk_use_dict: true,
123            concretized_json_types: HashMap::new(),
124        }
125    }
126}
127
128impl FlatSchemaOptions {
129    /// Creates a options according to the primary key encoding.
130    pub fn from_encoding(encoding: PrimaryKeyEncoding) -> Self {
131        if encoding == PrimaryKeyEncoding::Dense {
132            Self::default()
133        } else {
134            Self {
135                raw_pk_columns: false,
136                string_pk_use_dict: false,
137                concretized_json_types: HashMap::new(),
138            }
139        }
140    }
141}
142
143/// Gets the arrow schema to store in parquet.
144///
145/// The schema is:
146/// ```text
147/// primary key columns, field columns, time index, __primary_key, __sequence, __op_type
148/// ```
149///
150/// # Panics
151/// Panics if the metadata is invalid.
152pub fn to_flat_sst_arrow_schema(
153    metadata: &RegionMetadata,
154    options: &FlatSchemaOptions,
155) -> SchemaRef {
156    let num_fields = flat_sst_arrow_schema_column_num(metadata, options);
157    let mut fields = Vec::with_capacity(num_fields);
158    let schema = metadata.schema.arrow_schema();
159    if options.raw_pk_columns {
160        for pk_id in &metadata.primary_key {
161            let pk_index = metadata.column_index_by_id(*pk_id).unwrap();
162            let column_id = metadata.column_metadatas[pk_index].column_id;
163            if options.string_pk_use_dict {
164                let old_field = &schema.fields[pk_index];
165                let new_field = tag_maybe_to_dictionary_field(
166                    &metadata.column_metadatas[pk_index].column_schema.data_type,
167                    old_field,
168                );
169                let new_field = concretize_json_type(new_field, options);
170                fields.push(Arc::new(with_field_id((*new_field).clone(), column_id)));
171            }
172        }
173    }
174    let remaining_fields = schema
175        .fields()
176        .iter()
177        .zip(&metadata.column_metadatas)
178        .filter_map(|(field, column_meta)| {
179            if column_meta.semantic_type == SemanticType::Field {
180                let field = concretize_json_type(field.clone(), options);
181                Some(Arc::new(with_field_id(
182                    Arc::unwrap_or_clone(field),
183                    column_meta.column_id,
184                )))
185            } else {
186                None
187            }
188        })
189        .chain([Arc::new(with_field_id(
190            (*metadata.time_index_field()).clone(),
191            metadata.time_index_column().column_id,
192        ))])
193        .chain(internal_fields());
194    for field in remaining_fields {
195        fields.push(field);
196    }
197
198    Arc::new(Schema::new(fields))
199}
200
201fn concretize_json_type(field: Arc<Field>, options: &FlatSchemaOptions) -> Arc<Field> {
202    if let Some(data_type) = options.concretized_json_types.get(field.name()) {
203        let mut field = Arc::unwrap_or_clone(field);
204        field.set_data_type(data_type.clone());
205        Arc::new(field)
206    } else {
207        field
208    }
209}
210
211/// Returns the number of columns in the flat format.
212pub fn flat_sst_arrow_schema_column_num(
213    metadata: &RegionMetadata,
214    options: &FlatSchemaOptions,
215) -> usize {
216    if options.raw_pk_columns {
217        metadata.column_metadatas.len() + 3
218    } else {
219        metadata.column_metadatas.len() + 3 - metadata.primary_key.len()
220    }
221}
222
223/// Helper function to create a dictionary field from a field.
224fn to_dictionary_field(field: &Field) -> Field {
225    let mut new_field = Field::new_dictionary(
226        field.name(),
227        datatypes::arrow::datatypes::DataType::UInt32,
228        field.data_type().clone(),
229        field.is_nullable(),
230    );
231
232    // retain field_id metadata
233    if let Some(field_id) = field.metadata().get(PARQUET_FIELD_ID_KEY) {
234        new_field
235            .metadata_mut()
236            .insert(PARQUET_FIELD_ID_KEY.to_string(), field_id.clone());
237    }
238
239    new_field
240}
241
242/// Helper function to create a dictionary field from a field if it is a string column.
243pub(crate) fn tag_maybe_to_dictionary_field(
244    data_type: &ConcreteDataType,
245    field: &Arc<Field>,
246) -> Arc<Field> {
247    if data_type.is_string() {
248        Arc::new(to_dictionary_field(field))
249    } else {
250        field.clone()
251    }
252}
253
254/// Fields for internal columns.
255pub(crate) fn internal_fields() -> [FieldRef; 3] {
256    // Internal columns are always not null.
257    [
258        Arc::new(Field::new_dictionary(
259            PRIMARY_KEY_COLUMN_NAME,
260            ArrowDataType::UInt32,
261            ArrowDataType::Binary,
262            false,
263        )),
264        Arc::new(Field::new(
265            SEQUENCE_COLUMN_NAME,
266            ArrowDataType::UInt64,
267            false,
268        )),
269        Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)),
270    ]
271}
272
/// Estimates the number of series from record batches.
///
/// This struct tracks the last timestamp value to detect series boundaries:
/// a timestamp that is not greater than the previous one (it decreases or
/// stays the same) is counted as the start of a new series.
#[derive(Default)]
pub(crate) struct SeriesEstimator {
    /// The last timestamp value seen. It is `None` until a batch has been
    /// counted and after `finish()` is called.
    last_timestamp: Option<i64>,
    /// The estimated number of series
    series_count: u64,
}
284
285impl SeriesEstimator {
286    /// Updates the estimator with a new record batch in flat format.
287    ///
288    /// This method examines the time index column to detect series boundaries.
289    pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) {
290        let batch_rows = record_batch.num_rows();
291        if batch_rows == 0 {
292            return;
293        }
294
295        let time_index_pos = time_index_column_index(record_batch.num_columns());
296        let timestamps = record_batch.column(time_index_pos);
297        let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else {
298            return;
299        };
300        let values = ts_values.values();
301
302        // Checks if there's a boundary between the last batch and this batch
303        if let Some(last_ts) = self.last_timestamp {
304            if values[0] <= last_ts {
305                self.series_count += 1;
306            }
307        } else {
308            // First batch, counts as first series
309            self.series_count = 1;
310        }
311
312        // Counts series boundaries within this batch.
313        for i in 0..batch_rows - 1 {
314            // We assumes the same timestamp as a new series, which is different from
315            // how we split batches.
316            if values[i] >= values[i + 1] {
317                self.series_count += 1;
318            }
319        }
320
321        // Updates the last timestamp
322        self.last_timestamp = Some(values[batch_rows - 1]);
323    }
324
325    /// Returns the estimated number of series.
326    pub(crate) fn finish(&mut self) -> u64 {
327        self.last_timestamp = None;
328        let count = self.series_count;
329        self.series_count = 0;
330
331        count
332    }
333}
334
// Unit tests for `SeriesEstimator::update_flat()` and `SeriesEstimator::finish()`.
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datatypes::arrow::array::{
        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
        UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
    use datatypes::arrow::record_batch::RecordBatch;

    use super::*;

    /// Builds a record batch in flat format that only has the time index column
    /// and the three internal columns. The time index holds `timestamps` as
    /// millisecond timestamps; the internal columns are filled with constants.
    fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
        // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type]
        let num_cols = 4; // time_index + 3 internal columns
        let time_index_pos = time_index_column_index(num_cols);
        assert_eq!(time_index_pos, 0); // For 4 columns, time index should be at position 0

        let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
        // Every row points to the same dictionary value.
        let pk_array = Arc::new(DictionaryArray::new(
            UInt32Array::from(vec![0; timestamps.len()]),
            Arc::new(BinaryArray::from(vec![b"test".as_slice()])),
        ));
        let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()]));
        let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()]));

        let schema = Arc::new(Schema::new(vec![
            Field::new(
                "time",
                ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
                false,
            ),
            Field::new_dictionary(
                "__primary_key",
                ArrowDataType::UInt32,
                ArrowDataType::Binary,
                false,
            ),
            Field::new("__sequence", ArrowDataType::UInt64, false),
            Field::new("__op_type", ArrowDataType::UInt8, false),
        ]));

        RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
    }

    // An empty batch must not count as a series.
    #[test]
    fn test_series_estimator_flat_empty_batch() {
        let mut estimator = SeriesEstimator::default();
        let record_batch = new_flat_record_batch(&[]);
        estimator.update_flat(&record_batch);
        assert_eq!(0, estimator.finish());
    }

    // Strictly increasing timestamps form a single series.
    #[test]
    fn test_series_estimator_flat_single_batch() {
        let mut estimator = SeriesEstimator::default();
        let record_batch = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&record_batch);
        assert_eq!(1, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_series_boundary_within_batch() {
        let mut estimator = SeriesEstimator::default();
        // Timestamps decrease from 3 to 2, indicating a series boundary
        let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]);
        estimator.update_flat(&record_batch);
        // Should detect boundary at position 3 (3 >= 2)
        assert_eq!(2, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_multiple_boundaries_within_batch() {
        let mut estimator = SeriesEstimator::default();
        // Multiple series boundaries: 5>=4, 6>=3
        let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]);
        estimator.update_flat(&record_batch);
        assert_eq!(3, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_equal_timestamps() {
        let mut estimator = SeriesEstimator::default();
        // Equal timestamps are considered as new series
        let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]);
        estimator.update_flat(&record_batch);
        // Boundaries at: 2>=2, 3>=3, 3>=3
        assert_eq!(4, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_multiple_batches_continuation() {
        let mut estimator = SeriesEstimator::default();

        // First batch: timestamps 1, 2, 3
        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        // Second batch: timestamps 4, 5, 6 (continuation)
        let batch2 = new_flat_record_batch(&[4, 5, 6]);
        estimator.update_flat(&batch2);

        assert_eq!(1, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_multiple_batches_new_series() {
        let mut estimator = SeriesEstimator::default();

        // First batch: timestamps 1, 2, 3
        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        // Second batch: timestamps 2, 3, 4 (goes back to 2, new series)
        let batch2 = new_flat_record_batch(&[2, 3, 4]);
        estimator.update_flat(&batch2);

        assert_eq!(2, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_boundary_at_batch_edge_equal() {
        let mut estimator = SeriesEstimator::default();

        // First batch ending at 5
        let batch1 = new_flat_record_batch(&[1, 2, 5]);
        estimator.update_flat(&batch1);

        // Second batch starting at 5 (equal timestamp, new series)
        let batch2 = new_flat_record_batch(&[5, 6, 7]);
        estimator.update_flat(&batch2);

        assert_eq!(2, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_mixed_batches() {
        let mut estimator = SeriesEstimator::default();

        // Batch 1: single series [10, 20, 30]
        let batch1 = new_flat_record_batch(&[10, 20, 30]);
        estimator.update_flat(&batch1);

        // Batch 2: starts new series [5, 15], boundary within batch [15, 10, 25]
        let batch2 = new_flat_record_batch(&[5, 15, 10, 25]);
        estimator.update_flat(&batch2);

        // Batch 3: continues from 25 to [30, 35]
        let batch3 = new_flat_record_batch(&[30, 35]);
        estimator.update_flat(&batch3);

        // Expected: 1 (batch1) + 1 (batch2 start) + 1 (within batch2) = 3
        assert_eq!(3, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_descending_timestamps() {
        let mut estimator = SeriesEstimator::default();
        // Strictly descending timestamps - each pair creates a boundary
        let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]);
        estimator.update_flat(&record_batch);
        // Boundaries: 10>=9, 9>=8, 8>=7, 7>=6 = 4 boundaries + 1 initial = 5 series
        assert_eq!(5, estimator.finish());
    }

    #[test]
    fn test_series_estimator_flat_finish_resets_state() {
        let mut estimator = SeriesEstimator::default();

        let batch1 = new_flat_record_batch(&[1, 2, 3]);
        estimator.update_flat(&batch1);

        assert_eq!(1, estimator.finish());

        // After finish, state should be reset
        let batch2 = new_flat_record_batch(&[4, 5, 6]);
        estimator.update_flat(&batch2);

        // The second batch is counted from scratch, not as a continuation.
        assert_eq!(1, estimator.finish());
    }
}