mito2/memtable/
partition_tree.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Memtable implementation based on a partition tree.
16
17pub(crate) mod data;
18mod dedup;
19mod dict;
20mod merger;
21mod partition;
22mod shard;
23mod shard_builder;
24mod tree;
25
26use std::fmt;
27use std::sync::Arc;
28use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
29
30use common_base::readable_size::ReadableSize;
31use common_stat::get_total_memory_readable;
32use mito_codec::key_values::KeyValue;
33use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
34use serde::{Deserialize, Serialize};
35use store_api::metadata::RegionMetadataRef;
36use store_api::storage::{ColumnId, SequenceRange};
37use table::predicate::Predicate;
38
39use crate::error::{Result, UnsupportedOperationSnafu};
40use crate::flush::WriteBufferManagerRef;
41use crate::memtable::bulk::part::BulkPart;
42use crate::memtable::partition_tree::tree::PartitionTree;
43use crate::memtable::stats::WriteMetrics;
44use crate::memtable::{
45    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
46    MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
47    MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
48};
49use crate::region::options::MergeMode;
50
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
///
/// `PartitionTreeConfig::default` divides the total memory by this factor and
/// never goes above its built-in `ReadableSize::mb(512)` limit.
pub(crate) const DICTIONARY_SIZE_FACTOR: u64 = 8;
/// Default maximum number of keys in an index shard.
///
/// Same value as the default of `PartitionTreeConfig::index_max_keys_per_shard`.
pub(crate) const DEFAULT_MAX_KEYS_PER_SHARD: usize = 8192;
/// Default number of rows to freeze a data part.
///
/// Same value as the default of `PartitionTreeConfig::data_freeze_threshold`.
pub(crate) const DEFAULT_FREEZE_THRESHOLD: usize = 131072;

/// Id of a shard, only unique inside a partition.
type ShardId = u32;
/// Index of a primary key in a shard.
type PkIndex = u16;
60
/// Id of a primary key inside a tree.
///
/// A key is addressed by the shard that holds it plus its index inside that shard.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct PkId {
    /// Id of the shard that holds the key.
    shard_id: ShardId,
    /// Index of the key inside the shard.
    pk_index: PkIndex,
}
67
// TODO(yingwen): `fork_dictionary_bytes` is a per-region option, so if we have multiple partition
// tree memtables we will use a lot of memory. We should find a better way to control the
// dictionary size.
/// Config for the partition tree memtable.
///
/// Fields missing from the serialized form fall back to the values of
/// [`Default::default`] because of the container-level `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct PartitionTreeConfig {
    /// Max keys in an index shard.
    pub index_max_keys_per_shard: usize,
    /// Number of rows to freeze a data part.
    pub data_freeze_threshold: usize,
    /// Whether to remove duplicate rows.
    ///
    /// Skips deserializing as it should be determined by whether the
    /// table is append only. A deserialized config therefore always
    /// carries the default value of this field.
    #[serde(skip_deserializing)]
    pub dedup: bool,
    /// Total bytes of dictionary to keep in fork.
    pub fork_dictionary_bytes: ReadableSize,
    /// Merge mode of the tree.
    ///
    /// Skipped while deserializing, so a deserialized config always carries
    /// the default value of this field.
    #[serde(skip_deserializing)]
    pub merge_mode: MergeMode,
}
91
92impl Default for PartitionTreeConfig {
93    fn default() -> Self {
94        let mut fork_dictionary_bytes = ReadableSize::mb(512);
95        if let Some(total_memory) = get_total_memory_readable() {
96            let adjust_dictionary_bytes =
97                std::cmp::min(total_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
98            if adjust_dictionary_bytes.0 > 0 {
99                fork_dictionary_bytes = adjust_dictionary_bytes;
100            }
101        }
102
103        Self {
104            index_max_keys_per_shard: 8192,
105            data_freeze_threshold: 131072,
106            dedup: true,
107            fork_dictionary_bytes,
108            merge_mode: MergeMode::LastRow,
109        }
110    }
111}
112
/// Memtable based on a partition tree.
pub struct PartitionTreeMemtable {
    /// Id of this memtable.
    id: MemtableId,
    /// The partition tree that stores the rows.
    tree: Arc<PartitionTree>,
    /// Tracks the value bytes reported by successful writes.
    alloc_tracker: AllocTracker,
    /// Max timestamp written so far; starts at `i64::MIN`.
    max_timestamp: AtomicI64,
    /// Min timestamp written so far; starts at `i64::MAX`.
    min_timestamp: AtomicI64,
    /// Max sequence written so far; starts at 0.
    max_sequence: AtomicU64,
    /// Total written rows in memtable. This also includes deleted and duplicated rows.
    num_rows: AtomicUsize,
}
124
125impl fmt::Debug for PartitionTreeMemtable {
126    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127        f.debug_struct("PartitionTreeMemtable")
128            .field("id", &self.id)
129            .finish()
130    }
131}
132
133impl Memtable for PartitionTreeMemtable {
134    fn id(&self) -> MemtableId {
135        self.id
136    }
137
138    fn write(&self, kvs: &KeyValues) -> Result<()> {
139        if kvs.is_empty() {
140            return Ok(());
141        }
142
143        // TODO(yingwen): Validate schema while inserting rows.
144
145        let mut metrics = WriteMetrics::default();
146        let mut pk_buffer = Vec::new();
147        // Ensures the memtable always updates stats.
148        let res = self.tree.write(kvs, &mut pk_buffer, &mut metrics);
149
150        if res.is_ok() {
151            metrics.max_sequence = kvs.max_sequence();
152            metrics.num_rows = kvs.num_rows();
153            self.update_stats(&metrics);
154        }
155        res
156    }
157
158    fn write_one(&self, key_value: KeyValue) -> Result<()> {
159        let mut metrics = WriteMetrics::default();
160        let mut pk_buffer = Vec::new();
161        // Ensures the memtable always updates stats.
162        let res = self.tree.write_one(key_value, &mut pk_buffer, &mut metrics);
163
164        // update max_sequence
165        if res.is_ok() {
166            metrics.max_sequence = metrics.max_sequence.max(key_value.sequence());
167            metrics.num_rows = 1;
168            self.update_stats(&metrics);
169        }
170        res
171    }
172
173    fn write_bulk(&self, _part: BulkPart) -> Result<()> {
174        UnsupportedOperationSnafu {
175            err_msg: "PartitionTreeMemtable does not support write_bulk",
176        }
177        .fail()
178    }
179
180    #[cfg(any(test, feature = "test"))]
181    fn iter(
182        &self,
183        projection: Option<&[ColumnId]>,
184        predicate: Option<Predicate>,
185        sequence: Option<SequenceRange>,
186    ) -> Result<BoxedBatchIterator> {
187        self.tree.read(projection, predicate, sequence, None)
188    }
189
190    fn ranges(
191        &self,
192        projection: Option<&[ColumnId]>,
193        options: RangesOptions,
194    ) -> Result<MemtableRanges> {
195        let predicate = options.predicate;
196        let sequence = options.sequence;
197        let read_column_ids = read_column_ids_from_projection(&self.tree.metadata, projection);
198        let projection = projection.map(|ids| ids.to_vec());
199        let builder = Box::new(PartitionTreeIterBuilder {
200            tree: self.tree.clone(),
201            projection,
202            predicate: predicate.predicate().cloned(),
203            sequence,
204        });
205        let adapter_context = Arc::new(BatchToRecordBatchContext::new(
206            self.tree.metadata.clone(),
207            read_column_ids,
208        ));
209        let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
210            self.id,
211            builder,
212            predicate,
213            Some(adapter_context),
214        ));
215
216        let range_stats = self.stats();
217        let range = MemtableRange::new(context, range_stats);
218        Ok(MemtableRanges {
219            ranges: [(0, range)].into(),
220        })
221    }
222
223    fn is_empty(&self) -> bool {
224        self.tree.is_empty()
225    }
226
227    fn freeze(&self) -> Result<()> {
228        self.alloc_tracker.done_allocating();
229
230        self.tree.freeze()
231    }
232
233    fn stats(&self) -> MemtableStats {
234        let estimated_bytes = self.alloc_tracker.bytes_allocated();
235
236        if estimated_bytes == 0 {
237            // no rows ever written
238            return MemtableStats {
239                estimated_bytes,
240                time_range: None,
241                num_rows: 0,
242                num_ranges: 0,
243                max_sequence: 0,
244                series_count: 0,
245            };
246        }
247
248        let ts_type = self
249            .tree
250            .metadata
251            .time_index_column()
252            .column_schema
253            .data_type
254            .clone()
255            .as_timestamp()
256            .expect("Timestamp column must have timestamp type");
257        let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
258        let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
259        let series_count = self.tree.series_count();
260        MemtableStats {
261            estimated_bytes,
262            time_range: Some((min_timestamp, max_timestamp)),
263            num_rows: self.num_rows.load(Ordering::Relaxed),
264            num_ranges: 1,
265            max_sequence: self.max_sequence.load(Ordering::Relaxed),
266            series_count,
267        }
268    }
269
270    fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
271        let tree = self.tree.fork(metadata.clone());
272
273        let memtable = PartitionTreeMemtable::with_tree(id, tree);
274        Arc::new(memtable)
275    }
276}
277
278impl PartitionTreeMemtable {
279    /// Returns a new memtable.
280    pub fn new(
281        id: MemtableId,
282        row_codec: Arc<dyn PrimaryKeyCodec>,
283        metadata: RegionMetadataRef,
284        write_buffer_manager: Option<WriteBufferManagerRef>,
285        config: &PartitionTreeConfig,
286    ) -> Self {
287        Self::with_tree(
288            id,
289            PartitionTree::new(row_codec, metadata, config, write_buffer_manager.clone()),
290        )
291    }
292
293    /// Creates a mutable memtable from the tree.
294    ///
295    /// It also adds the bytes used by shared parts (e.g. index) to the memory usage.
296    fn with_tree(id: MemtableId, tree: PartitionTree) -> Self {
297        let alloc_tracker = AllocTracker::new(tree.write_buffer_manager());
298
299        Self {
300            id,
301            tree: Arc::new(tree),
302            alloc_tracker,
303            max_timestamp: AtomicI64::new(i64::MIN),
304            min_timestamp: AtomicI64::new(i64::MAX),
305            num_rows: AtomicUsize::new(0),
306            max_sequence: AtomicU64::new(0),
307        }
308    }
309
310    /// Updates stats of the memtable.
311    fn update_stats(&self, metrics: &WriteMetrics) {
312        // Only let the tracker tracks value bytes.
313        self.alloc_tracker.on_allocation(metrics.value_bytes);
314        self.max_timestamp
315            .fetch_max(metrics.max_ts, Ordering::SeqCst);
316        self.min_timestamp
317            .fetch_min(metrics.min_ts, Ordering::SeqCst);
318        self.num_rows.fetch_add(metrics.num_rows, Ordering::SeqCst);
319        self.max_sequence
320            .fetch_max(metrics.max_sequence, Ordering::SeqCst);
321    }
322
323    #[cfg(any(test, feature = "test"))]
324    pub fn iter(
325        &self,
326        projection: Option<&[ColumnId]>,
327        predicate: Option<Predicate>,
328        sequence: Option<SequenceRange>,
329    ) -> Result<BoxedBatchIterator> {
330        self.tree.read(projection, predicate, sequence, None)
331    }
332}
333
/// Builder to build a [PartitionTreeMemtable].
#[derive(Debug, Default)]
pub struct PartitionTreeMemtableBuilder {
    /// Config passed to every memtable this builder creates.
    config: PartitionTreeConfig,
    /// Optional write buffer manager, cloned into every memtable this builder creates.
    write_buffer_manager: Option<WriteBufferManagerRef>,
}
340
341impl PartitionTreeMemtableBuilder {
342    /// Creates a new builder with specific `write_buffer_manager`.
343    pub fn new(
344        config: PartitionTreeConfig,
345        write_buffer_manager: Option<WriteBufferManagerRef>,
346    ) -> Self {
347        Self {
348            config,
349            write_buffer_manager,
350        }
351    }
352}
353
354impl MemtableBuilder for PartitionTreeMemtableBuilder {
355    fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
356        let codec = build_primary_key_codec(metadata);
357        Arc::new(PartitionTreeMemtable::new(
358            id,
359            codec,
360            metadata.clone(),
361            self.write_buffer_manager.clone(),
362            &self.config,
363        ))
364    }
365
366    fn use_bulk_insert(&self, _metadata: &RegionMetadataRef) -> bool {
367        false
368    }
369}
370
/// Iterator builder that reads a [PartitionTree] with a fixed projection,
/// predicate and sequence range.
struct PartitionTreeIterBuilder {
    /// Tree to read from.
    tree: Arc<PartitionTree>,
    /// Ids of columns to read; `None` is forwarded to the tree as is.
    projection: Option<Vec<ColumnId>>,
    /// Optional predicate forwarded to the tree.
    predicate: Option<Predicate>,
    /// Optional sequence range forwarded to the tree.
    sequence: Option<SequenceRange>,
}
377
378impl IterBuilder for PartitionTreeIterBuilder {
379    fn build(&self, metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
380        self.tree.read(
381            self.projection.as_deref(),
382            self.predicate.clone(),
383            self.sequence,
384            metrics,
385        )
386    }
387}
388
389#[cfg(test)]
390mod tests {
391    use std::collections::HashMap;
392    use std::sync::Arc;
393
394    use api::v1::helper::{field_column_schema, row, tag_column_schema, time_index_column_schema};
395    use api::v1::value::ValueData;
396    use api::v1::{Mutation, OpType, Rows, SemanticType};
397    use common_query::prelude::{greptime_timestamp, greptime_value};
398    use common_time::Timestamp;
399    use datafusion_common::Column;
400    use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
401    use datatypes::data_type::ConcreteDataType;
402    use datatypes::prelude::Vector;
403    use datatypes::scalars::ScalarVector;
404    use datatypes::schema::ColumnSchema;
405    use datatypes::value::Value;
406    use datatypes::vectors::{Int64Vector, StringVector};
407    use mito_codec::row_converter::DensePrimaryKeyCodec;
408    use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
409    use store_api::storage::RegionId;
410
411    use super::*;
412    use crate::test_util::memtable_util::{
413        self, collect_iter_timestamps, region_metadata_to_row_schema,
414    };
415
416    #[test]
417    fn test_memtable_sorted_input() {
418        write_iter_sorted_input(true);
419        write_iter_sorted_input(false);
420    }
421
422    fn write_iter_sorted_input(has_pk: bool) {
423        let metadata = if has_pk {
424            Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
425        } else {
426            Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
427        };
428        let timestamps = (0..100).collect::<Vec<_>>();
429        let kvs =
430            memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
431        let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
432        let memtable = PartitionTreeMemtable::new(
433            1,
434            codec,
435            metadata.clone(),
436            None,
437            &PartitionTreeConfig::default(),
438        );
439        memtable.write(&kvs).unwrap();
440
441        let expected_ts = kvs
442            .iter()
443            .map(|kv| {
444                kv.timestamp()
445                    .try_into_timestamp()
446                    .unwrap()
447                    .unwrap()
448                    .value()
449            })
450            .collect::<Vec<_>>();
451
452        let iter = memtable.iter(None, None, None).unwrap();
453        let read = collect_iter_timestamps(iter);
454        assert_eq!(expected_ts, read);
455
456        let stats = memtable.stats();
457        assert!(stats.bytes_allocated() > 0);
458        assert_eq!(
459            Some((
460                Timestamp::new_millisecond(0),
461                Timestamp::new_millisecond(99)
462            )),
463            stats.time_range()
464        );
465    }
466
467    #[test]
468    fn test_memtable_unsorted_input() {
469        write_iter_unsorted_input(true);
470        write_iter_unsorted_input(false);
471    }
472
473    fn write_iter_unsorted_input(has_pk: bool) {
474        let metadata = if has_pk {
475            Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
476        } else {
477            Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
478        };
479        let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
480        let memtable = PartitionTreeMemtable::new(
481            1,
482            codec,
483            metadata.clone(),
484            None,
485            &PartitionTreeConfig::default(),
486        );
487
488        let kvs = memtable_util::build_key_values(
489            &metadata,
490            "hello".to_string(),
491            0,
492            &[1, 3, 7, 5, 6],
493            0, // sequence 0, 1, 2, 3, 4
494        );
495        memtable.write(&kvs).unwrap();
496
497        let kvs = memtable_util::build_key_values(
498            &metadata,
499            "hello".to_string(),
500            0,
501            &[5, 2, 4, 0, 7],
502            5, // sequence 5, 6, 7, 8, 9
503        );
504        memtable.write(&kvs).unwrap();
505
506        let iter = memtable.iter(None, None, None).unwrap();
507        let read = collect_iter_timestamps(iter);
508        assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], read);
509
510        let iter = memtable.iter(None, None, None).unwrap();
511        let read = iter
512            .flat_map(|batch| {
513                batch
514                    .unwrap()
515                    .sequences()
516                    .iter_data()
517                    .collect::<Vec<_>>()
518                    .into_iter()
519            })
520            .map(|v| v.unwrap())
521            .collect::<Vec<_>>();
522        assert_eq!(vec![8, 0, 6, 1, 7, 5, 4, 9], read);
523
524        let stats = memtable.stats();
525        assert!(stats.bytes_allocated() > 0);
526        assert_eq!(
527            Some((Timestamp::new_millisecond(0), Timestamp::new_millisecond(7))),
528            stats.time_range()
529        );
530    }
531
532    #[test]
533    fn test_memtable_projection() {
534        write_iter_projection(true);
535        write_iter_projection(false);
536    }
537
538    fn write_iter_projection(has_pk: bool) {
539        let metadata = if has_pk {
540            Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
541        } else {
542            Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
543        };
544        // Try to build a memtable via the builder.
545        let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
546            .build(1, &metadata);
547
548        let expect = (0..100).collect::<Vec<_>>();
549        let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
550        memtable.write(&kvs).unwrap();
551        let iter = memtable.iter(Some(&[3]), None, None).unwrap();
552
553        let mut v0_all = vec![];
554        for res in iter {
555            let batch = res.unwrap();
556            assert_eq!(1, batch.fields().len());
557            let v0 = batch
558                .fields()
559                .first()
560                .unwrap()
561                .data
562                .as_any()
563                .downcast_ref::<Int64Vector>()
564                .unwrap();
565            v0_all.extend(v0.iter_data().map(|v| v.unwrap()));
566        }
567        assert_eq!(expect, v0_all);
568    }
569
570    #[test]
571    fn test_write_iter_multi_keys() {
572        write_iter_multi_keys(1, 100);
573        write_iter_multi_keys(2, 100);
574        write_iter_multi_keys(4, 100);
575        write_iter_multi_keys(8, 5);
576        write_iter_multi_keys(2, 10);
577    }
578
579    fn write_iter_multi_keys(max_keys: usize, freeze_threshold: usize) {
580        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
581        let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
582        let memtable = PartitionTreeMemtable::new(
583            1,
584            codec,
585            metadata.clone(),
586            None,
587            &PartitionTreeConfig {
588                index_max_keys_per_shard: max_keys,
589                data_freeze_threshold: freeze_threshold,
590                ..Default::default()
591            },
592        );
593
594        let mut data = Vec::new();
595        // 4 partitions, each partition 4 pks.
596        for i in 0..4 {
597            for j in 0..4 {
598                // key: i, a{j}
599                let timestamps = [11, 13, 1, 5, 3, 7, 9];
600                let key = format!("a{j}");
601                let kvs =
602                    memtable_util::build_key_values(&metadata, key.clone(), i, &timestamps, 0);
603                memtable.write(&kvs).unwrap();
604                for ts in timestamps {
605                    data.push((i, key.clone(), ts));
606                }
607            }
608            for j in 0..4 {
609                // key: i, a{j}
610                let timestamps = [10, 2, 4, 8, 6];
611                let key = format!("a{j}");
612                let kvs =
613                    memtable_util::build_key_values(&metadata, key.clone(), i, &timestamps, 200);
614                memtable.write(&kvs).unwrap();
615                for ts in timestamps {
616                    data.push((i, key.clone(), ts));
617                }
618            }
619        }
620        data.sort_unstable();
621
622        let expect = data.into_iter().map(|x| x.2).collect::<Vec<_>>();
623        let iter = memtable.iter(None, None, None).unwrap();
624        let read = collect_iter_timestamps(iter);
625        assert_eq!(expect, read);
626    }
627
628    #[test]
629    fn test_memtable_filter() {
630        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false));
631        // Try to build a memtable via the builder.
632        let memtable = PartitionTreeMemtableBuilder::new(
633            PartitionTreeConfig {
634                index_max_keys_per_shard: 40,
635                ..Default::default()
636            },
637            None,
638        )
639        .build(1, &metadata);
640
641        for i in 0..100 {
642            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
643            let kvs =
644                memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
645            memtable.write(&kvs).unwrap();
646        }
647
648        for i in 0..100 {
649            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
650            let expr = Expr::BinaryExpr(BinaryExpr {
651                left: Box::new(Expr::Column(Column::from_name("k1"))),
652                op: Operator::Eq,
653                right: Box::new((i as u32).lit()),
654            });
655            let iter = memtable
656                .iter(None, Some(Predicate::new(vec![expr])), None)
657                .unwrap();
658            let read = collect_iter_timestamps(iter);
659            assert_eq!(timestamps, read);
660        }
661    }
662
663    #[test]
664    fn test_deserialize_config() {
665        let config = PartitionTreeConfig {
666            dedup: false,
667            ..Default::default()
668        };
669        // Creates a json with dedup = false.
670        let json = serde_json::to_string(&config).unwrap();
671        let config: PartitionTreeConfig = serde_json::from_str(&json).unwrap();
672        assert!(config.dedup);
673        assert_eq!(PartitionTreeConfig::default(), config);
674    }
675
676    fn metadata_for_metric_engine() -> RegionMetadataRef {
677        let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
678        builder
679            .push_column_metadata(ColumnMetadata {
680                column_schema: ColumnSchema::new(
681                    "__table_id",
682                    ConcreteDataType::uint32_datatype(),
683                    false,
684                ),
685                semantic_type: SemanticType::Tag,
686                column_id: 2147483652,
687            })
688            .push_column_metadata(ColumnMetadata {
689                column_schema: ColumnSchema::new(
690                    "__tsid",
691                    ConcreteDataType::uint64_datatype(),
692                    false,
693                ),
694                semantic_type: SemanticType::Tag,
695                column_id: 2147483651,
696            })
697            .push_column_metadata(ColumnMetadata {
698                column_schema: ColumnSchema::new(
699                    "test_label",
700                    ConcreteDataType::string_datatype(),
701                    false,
702                ),
703                semantic_type: SemanticType::Tag,
704                column_id: 2,
705            })
706            .push_column_metadata(ColumnMetadata {
707                column_schema: ColumnSchema::new(
708                    greptime_timestamp(),
709                    ConcreteDataType::timestamp_millisecond_datatype(),
710                    false,
711                ),
712                semantic_type: SemanticType::Timestamp,
713                column_id: 0,
714            })
715            .push_column_metadata(ColumnMetadata {
716                column_schema: ColumnSchema::new(
717                    greptime_value(),
718                    ConcreteDataType::float64_datatype(),
719                    true,
720                ),
721                semantic_type: SemanticType::Field,
722                column_id: 1,
723            })
724            .primary_key(vec![2147483652, 2147483651, 2]);
725        let region_metadata = builder.build().unwrap();
726        Arc::new(region_metadata)
727    }
728
729    fn build_key_values(
730        metadata: RegionMetadataRef,
731        labels: &[&str],
732        table_id: &[u32],
733        ts_id: &[u64],
734        ts: &[i64],
735        values: &[f64],
736        sequence: u64,
737    ) -> KeyValues {
738        let column_schema = region_metadata_to_row_schema(&metadata);
739
740        let rows = ts
741            .iter()
742            .zip(table_id.iter())
743            .zip(ts_id.iter())
744            .zip(labels.iter())
745            .zip(values.iter())
746            .map(|((((ts, table_id), ts_id), label), val)| {
747                row(vec![
748                    ValueData::U32Value(*table_id),
749                    ValueData::U64Value(*ts_id),
750                    ValueData::StringValue(label.to_string()),
751                    ValueData::TimestampMillisecondValue(*ts),
752                    ValueData::F64Value(*val),
753                ])
754            })
755            .collect();
756        let mutation = api::v1::Mutation {
757            op_type: 1,
758            sequence,
759            rows: Some(Rows {
760                schema: column_schema,
761                rows,
762            }),
763            write_hint: None,
764        };
765        KeyValues::new(metadata.as_ref(), mutation).unwrap()
766    }
767
768    #[test]
769    fn test_write_freeze() {
770        let metadata = metadata_for_metric_engine();
771        let memtable = PartitionTreeMemtableBuilder::new(
772            PartitionTreeConfig {
773                index_max_keys_per_shard: 40,
774                ..Default::default()
775            },
776            None,
777        )
778        .build(1, &metadata);
779
780        let codec = DensePrimaryKeyCodec::new(&metadata);
781
782        memtable
783            .write(&build_key_values(
784                metadata.clone(),
785                &["daily", "10min", "daily", "10min"],
786                &[1025, 1025, 1025, 1025],
787                &[
788                    16442255374049317291,
789                    5686004715529701024,
790                    16442255374049317291,
791                    5686004715529701024,
792                ],
793                &[1712070000000, 1712717731000, 1712761200000, 1712761200000],
794                &[0.0, 0.0, 0.0, 0.0],
795                1,
796            ))
797            .unwrap();
798
799        memtable.freeze().unwrap();
800        let new_memtable = memtable.fork(2, &metadata);
801
802        new_memtable
803            .write(&build_key_values(
804                metadata.clone(),
805                &["10min"],
806                &[1025],
807                &[5686004715529701024],
808                &[1714643131000],
809                &[0.1],
810                2,
811            ))
812            .unwrap();
813
814        let mut reader = new_memtable.iter(None, None, None).unwrap();
815        let batch = reader.next().unwrap().unwrap();
816        let pk = codec.decode(batch.primary_key()).unwrap().into_dense();
817        if let Value::String(s) = &pk[2] {
818            assert_eq!("10min", s.as_utf8());
819        } else {
820            unreachable!()
821        }
822    }
823
824    fn kv_region_metadata() -> RegionMetadataRef {
825        let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
826        builder
827            .push_column_metadata(ColumnMetadata {
828                column_schema: ColumnSchema::new(
829                    "ts",
830                    ConcreteDataType::timestamp_millisecond_datatype(),
831                    false,
832                ),
833                semantic_type: SemanticType::Timestamp,
834                column_id: 0,
835            })
836            .push_column_metadata(ColumnMetadata {
837                column_schema: ColumnSchema::new("k", ConcreteDataType::string_datatype(), false),
838                semantic_type: SemanticType::Tag,
839                column_id: 1,
840            })
841            .push_column_metadata(ColumnMetadata {
842                column_schema: ColumnSchema::new("v", ConcreteDataType::string_datatype(), false),
843                semantic_type: SemanticType::Field,
844                column_id: 2,
845            })
846            .primary_key(vec![1]);
847        let region_metadata = builder.build().unwrap();
848        Arc::new(region_metadata)
849    }
850
851    fn kv_column_schemas() -> Vec<api::v1::ColumnSchema> {
852        vec![
853            time_index_column_schema("ts", api::v1::ColumnDataType::TimestampMillisecond),
854            tag_column_schema("k", api::v1::ColumnDataType::String),
855            field_column_schema("v", api::v1::ColumnDataType::String),
856        ]
857    }
858
859    fn key_values<T: AsRef<str>>(
860        metadata: &RegionMetadataRef,
861        keys: impl Iterator<Item = T>,
862    ) -> KeyValues {
863        let rows = keys
864            .map(|c| {
865                row(vec![
866                    ValueData::TimestampMillisecondValue(0),
867                    ValueData::StringValue(c.as_ref().to_string()),
868                    ValueData::StringValue(c.as_ref().to_string()),
869                ])
870            })
871            .collect();
872        let mutation = Mutation {
873            op_type: OpType::Put as i32,
874            sequence: 0,
875            rows: Some(Rows {
876                schema: kv_column_schemas(),
877                rows,
878            }),
879            write_hint: None,
880        };
881        KeyValues::new(metadata, mutation).unwrap()
882    }
883
884    fn collect_kvs(
885        iter: BoxedBatchIterator,
886        region_meta: &RegionMetadataRef,
887    ) -> HashMap<String, String> {
888        let decoder = DensePrimaryKeyCodec::new(region_meta);
889        let mut res = HashMap::new();
890        for v in iter {
891            let batch = v.unwrap();
892            let values = decoder.decode(batch.primary_key()).unwrap().into_dense();
893            let field_vector = batch.fields()[0]
894                .data
895                .as_any()
896                .downcast_ref::<StringVector>()
897                .unwrap();
898            for row in 0..batch.num_rows() {
899                res.insert(
900                    values[0].as_string().unwrap(),
901                    field_vector.get(row).as_string().unwrap(),
902                );
903            }
904        }
905        res
906    }
907
908    #[test]
909    fn test_reorder_insert_key_values() {
910        let metadata = kv_region_metadata();
911        let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
912            .build(1, &metadata);
913
914        memtable
915            .write(&key_values(&metadata, ('a'..'h').map(|c| c.to_string())))
916            .unwrap();
917        memtable.freeze().unwrap();
918        assert_eq!(
919            collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata),
920            ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect()
921        );
922        let forked = memtable.fork(2, &metadata);
923
924        let keys = ["c", "f", "i", "h", "b", "e", "g"];
925        forked.write(&key_values(&metadata, keys.iter())).unwrap();
926        forked.freeze().unwrap();
927        assert_eq!(
928            collect_kvs(forked.iter(None, None, None).unwrap(), &metadata),
929            keys.iter()
930                .map(|c| (c.to_string(), c.to_string()))
931                .collect()
932        );
933
934        let forked2 = forked.fork(3, &metadata);
935
936        let keys = ["g", "e", "a", "f", "b", "c", "h"];
937        forked2.write(&key_values(&metadata, keys.iter())).unwrap();
938
939        let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata);
940        let expected = keys
941            .iter()
942            .map(|c| (c.to_string(), c.to_string()))
943            .collect::<HashMap<_, _>>();
944        assert_eq!(kvs, expected);
945    }
946
947    #[test]
948    fn test_build_record_batch_iter_from_memtable() {
949        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
950        let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
951        let memtable = PartitionTreeMemtable::new(
952            1,
953            codec,
954            metadata.clone(),
955            None,
956            &PartitionTreeConfig::default(),
957        );
958
959        let kvs =
960            memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &[1, 2, 3], 0);
961        memtable.write(&kvs).unwrap();
962
963        let read_column_ids: Vec<ColumnId> = metadata
964            .column_metadatas
965            .iter()
966            .map(|c| c.column_id)
967            .collect();
968        let ranges = memtable
969            .ranges(Some(&read_column_ids), RangesOptions::default())
970            .unwrap();
971        assert!(!ranges.ranges.is_empty());
972
973        let mut total_rows = 0;
974        for range in ranges.ranges.into_values() {
975            let mut iter = range.build_record_batch_iter(None, None).unwrap();
976            while let Some(rb) = iter.next().transpose().unwrap() {
977                total_rows += rb.num_rows();
978                let schema = rb.schema();
979                let column_names: Vec<_> =
980                    schema.fields().iter().map(|f| f.name().as_str()).collect();
981                assert_eq!(
982                    column_names,
983                    vec![
984                        "__table_id",
985                        "k0",
986                        "v0",
987                        "v1",
988                        "ts",
989                        "__primary_key",
990                        "__sequence",
991                        "__op_type",
992                    ]
993                );
994            }
995        }
996        assert_eq!(3, total_rows);
997    }
998
999    #[test]
1000    fn test_build_record_batch_iter_with_time_range() {
1001        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
1002        let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
1003        let memtable = PartitionTreeMemtable::new(
1004            1,
1005            codec,
1006            metadata.clone(),
1007            None,
1008            &PartitionTreeConfig::default(),
1009        );
1010
1011        let kvs = memtable_util::build_key_values(
1012            &metadata,
1013            "hello".to_string(),
1014            42,
1015            &[1, 2, 3, 4, 5],
1016            0,
1017        );
1018        memtable.write(&kvs).unwrap();
1019
1020        let read_column_ids: Vec<ColumnId> = metadata
1021            .column_metadatas
1022            .iter()
1023            .map(|c| c.column_id)
1024            .collect();
1025        let ranges = memtable
1026            .ranges(Some(&read_column_ids), RangesOptions::default())
1027            .unwrap();
1028        assert!(!ranges.ranges.is_empty());
1029
1030        let time_range = (Timestamp::new_millisecond(2), Timestamp::new_millisecond(4));
1031
1032        let mut total_rows = 0;
1033        let mut all_timestamps = Vec::new();
1034        for range in ranges.ranges.into_values() {
1035            let mut iter = range
1036                .build_record_batch_iter(Some(time_range), None)
1037                .unwrap();
1038            while let Some(rb) = iter.next().transpose().unwrap() {
1039                total_rows += rb.num_rows();
1040                // ts column is at index 4 (after __table_id, k0, v0, v1)
1041                let ts_col = rb
1042                    .column_by_name("ts")
1043                    .unwrap()
1044                    .as_any()
1045                    .downcast_ref::<datatypes::arrow::array::TimestampMillisecondArray>()
1046                    .unwrap();
1047                for i in 0..ts_col.len() {
1048                    all_timestamps.push(ts_col.value(i));
1049                }
1050            }
1051        }
1052        assert_eq!(3, total_rows);
1053        all_timestamps.sort();
1054        assert_eq!(vec![2, 3, 4], all_timestamps);
1055    }
1056}