1pub(crate) mod data;
18mod dedup;
19mod dict;
20mod merger;
21mod partition;
22mod shard;
23mod shard_builder;
24mod tree;
25
26use std::fmt;
27use std::sync::Arc;
28use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
29
30use common_base::readable_size::ReadableSize;
31use common_stat::get_total_memory_readable;
32use mito_codec::key_values::KeyValue;
33use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
34use serde::{Deserialize, Serialize};
35use store_api::metadata::RegionMetadataRef;
36use store_api::storage::{ColumnId, SequenceRange};
37use table::predicate::Predicate;
38
39use crate::error::{Result, UnsupportedOperationSnafu};
40use crate::flush::WriteBufferManagerRef;
41use crate::memtable::bulk::part::BulkPart;
42use crate::memtable::partition_tree::tree::PartitionTree;
43use crate::memtable::stats::WriteMetrics;
44use crate::memtable::{
45 AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
46 MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
47 MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
48};
49use crate::region::options::MergeMode;
50
/// Divisor applied to the total system memory when sizing the fork dictionary:
/// `PartitionTreeConfig::default` caps `fork_dictionary_bytes` at
/// `total_memory / DICTIONARY_SIZE_FACTOR`.
pub(crate) const DICTIONARY_SIZE_FACTOR: u64 = 8;
/// Default maximum number of keys per shard; equal to the value
/// `PartitionTreeConfig::default` assigns to `index_max_keys_per_shard`.
pub(crate) const DEFAULT_MAX_KEYS_PER_SHARD: usize = 8192;
/// Default freeze threshold; equal to the value `PartitionTreeConfig::default`
/// assigns to `data_freeze_threshold`.
pub(crate) const DEFAULT_FREEZE_THRESHOLD: usize = 131072;
55
/// Identifier of a shard.
// NOTE(review): the scope in which a shard id is unique is not visible in this file — confirm.
type ShardId = u32;
/// Index of a primary key; paired with a [`ShardId`] in [`PkId`].
type PkIndex = u16;
60
/// Identifier of a primary key, made of the shard it belongs to and its index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct PkId {
    /// Shard part of the id.
    shard_id: ShardId,
    /// Primary key index part of the id (presumably relative to the shard — confirm).
    pk_index: PkIndex,
}
67
/// Configuration of the partition tree memtable.
///
/// `#[serde(default)]` makes every field optional in serialized input: a missing
/// field takes its value from [`PartitionTreeConfig::default`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct PartitionTreeConfig {
    /// Maximum number of keys in an index shard.
    pub index_max_keys_per_shard: usize,
    /// Threshold at which data is frozen.
    // NOTE(review): presumably a row count per data part — confirm against the tree code.
    pub data_freeze_threshold: usize,
    /// Whether duplicate rows are removed.
    ///
    /// Never read from serialized input (`skip_deserializing`): after
    /// deserialization it always holds the `Default` value.
    #[serde(skip_deserializing)]
    pub dedup: bool,
    /// Byte budget of the dictionary related to forking.
    // NOTE(review): exact semantics live in the tree implementation — confirm.
    pub fork_dictionary_bytes: ReadableSize,
    /// Merge mode of the tree.
    ///
    /// Never read from serialized input (`skip_deserializing`): after
    /// deserialization it always holds the `Default` value.
    #[serde(skip_deserializing)]
    pub merge_mode: MergeMode,
}
91
92impl Default for PartitionTreeConfig {
93 fn default() -> Self {
94 let mut fork_dictionary_bytes = ReadableSize::mb(512);
95 if let Some(total_memory) = get_total_memory_readable() {
96 let adjust_dictionary_bytes =
97 std::cmp::min(total_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
98 if adjust_dictionary_bytes.0 > 0 {
99 fork_dictionary_bytes = adjust_dictionary_bytes;
100 }
101 }
102
103 Self {
104 index_max_keys_per_shard: 8192,
105 data_freeze_threshold: 131072,
106 dedup: true,
107 fork_dictionary_bytes,
108 merge_mode: MergeMode::LastRow,
109 }
110 }
111}
112
/// Memtable backed by a [`PartitionTree`].
///
/// Besides the tree it keeps running statistics (time range, row count, max
/// sequence) that are updated after every successful write.
pub struct PartitionTreeMemtable {
    /// Id of this memtable.
    id: MemtableId,
    /// The tree that stores the rows; shared with the iterator builders created by `ranges`.
    tree: Arc<PartitionTree>,
    /// Tracks the value bytes reported by writes.
    alloc_tracker: AllocTracker,
    /// Largest timestamp written so far; starts at `i64::MIN`.
    max_timestamp: AtomicI64,
    /// Smallest timestamp written so far; starts at `i64::MAX`.
    min_timestamp: AtomicI64,
    /// Largest sequence written so far; starts at 0.
    max_sequence: AtomicU64,
    /// Total number of rows written so far.
    num_rows: AtomicUsize,
}
124
125impl fmt::Debug for PartitionTreeMemtable {
126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127 f.debug_struct("PartitionTreeMemtable")
128 .field("id", &self.id)
129 .finish()
130 }
131}
132
133impl Memtable for PartitionTreeMemtable {
134 fn id(&self) -> MemtableId {
135 self.id
136 }
137
138 fn write(&self, kvs: &KeyValues) -> Result<()> {
139 if kvs.is_empty() {
140 return Ok(());
141 }
142
143 let mut metrics = WriteMetrics::default();
146 let mut pk_buffer = Vec::new();
147 let res = self.tree.write(kvs, &mut pk_buffer, &mut metrics);
149
150 if res.is_ok() {
151 metrics.max_sequence = kvs.max_sequence();
152 metrics.num_rows = kvs.num_rows();
153 self.update_stats(&metrics);
154 }
155 res
156 }
157
158 fn write_one(&self, key_value: KeyValue) -> Result<()> {
159 let mut metrics = WriteMetrics::default();
160 let mut pk_buffer = Vec::new();
161 let res = self.tree.write_one(key_value, &mut pk_buffer, &mut metrics);
163
164 if res.is_ok() {
166 metrics.max_sequence = metrics.max_sequence.max(key_value.sequence());
167 metrics.num_rows = 1;
168 self.update_stats(&metrics);
169 }
170 res
171 }
172
173 fn write_bulk(&self, _part: BulkPart) -> Result<()> {
174 UnsupportedOperationSnafu {
175 err_msg: "PartitionTreeMemtable does not support write_bulk",
176 }
177 .fail()
178 }
179
180 #[cfg(any(test, feature = "test"))]
181 fn iter(
182 &self,
183 projection: Option<&[ColumnId]>,
184 predicate: Option<Predicate>,
185 sequence: Option<SequenceRange>,
186 ) -> Result<BoxedBatchIterator> {
187 self.tree.read(projection, predicate, sequence, None)
188 }
189
190 fn ranges(
191 &self,
192 projection: Option<&[ColumnId]>,
193 options: RangesOptions,
194 ) -> Result<MemtableRanges> {
195 let predicate = options.predicate;
196 let sequence = options.sequence;
197 let read_column_ids = read_column_ids_from_projection(&self.tree.metadata, projection);
198 let projection = projection.map(|ids| ids.to_vec());
199 let builder = Box::new(PartitionTreeIterBuilder {
200 tree: self.tree.clone(),
201 projection,
202 predicate: predicate.predicate().cloned(),
203 sequence,
204 });
205 let adapter_context = Arc::new(BatchToRecordBatchContext::new(
206 self.tree.metadata.clone(),
207 read_column_ids,
208 ));
209 let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
210 self.id,
211 builder,
212 predicate,
213 Some(adapter_context),
214 ));
215
216 let range_stats = self.stats();
217 let range = MemtableRange::new(context, range_stats);
218 Ok(MemtableRanges {
219 ranges: [(0, range)].into(),
220 })
221 }
222
223 fn is_empty(&self) -> bool {
224 self.tree.is_empty()
225 }
226
227 fn freeze(&self) -> Result<()> {
228 self.alloc_tracker.done_allocating();
229
230 self.tree.freeze()
231 }
232
233 fn stats(&self) -> MemtableStats {
234 let estimated_bytes = self.alloc_tracker.bytes_allocated();
235
236 if estimated_bytes == 0 {
237 return MemtableStats {
239 estimated_bytes,
240 time_range: None,
241 num_rows: 0,
242 num_ranges: 0,
243 max_sequence: 0,
244 series_count: 0,
245 };
246 }
247
248 let ts_type = self
249 .tree
250 .metadata
251 .time_index_column()
252 .column_schema
253 .data_type
254 .clone()
255 .as_timestamp()
256 .expect("Timestamp column must have timestamp type");
257 let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
258 let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
259 let series_count = self.tree.series_count();
260 MemtableStats {
261 estimated_bytes,
262 time_range: Some((min_timestamp, max_timestamp)),
263 num_rows: self.num_rows.load(Ordering::Relaxed),
264 num_ranges: 1,
265 max_sequence: self.max_sequence.load(Ordering::Relaxed),
266 series_count,
267 }
268 }
269
270 fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
271 let tree = self.tree.fork(metadata.clone());
272
273 let memtable = PartitionTreeMemtable::with_tree(id, tree);
274 Arc::new(memtable)
275 }
276}
277
278impl PartitionTreeMemtable {
279 pub fn new(
281 id: MemtableId,
282 row_codec: Arc<dyn PrimaryKeyCodec>,
283 metadata: RegionMetadataRef,
284 write_buffer_manager: Option<WriteBufferManagerRef>,
285 config: &PartitionTreeConfig,
286 ) -> Self {
287 Self::with_tree(
288 id,
289 PartitionTree::new(row_codec, metadata, config, write_buffer_manager.clone()),
290 )
291 }
292
293 fn with_tree(id: MemtableId, tree: PartitionTree) -> Self {
297 let alloc_tracker = AllocTracker::new(tree.write_buffer_manager());
298
299 Self {
300 id,
301 tree: Arc::new(tree),
302 alloc_tracker,
303 max_timestamp: AtomicI64::new(i64::MIN),
304 min_timestamp: AtomicI64::new(i64::MAX),
305 num_rows: AtomicUsize::new(0),
306 max_sequence: AtomicU64::new(0),
307 }
308 }
309
310 fn update_stats(&self, metrics: &WriteMetrics) {
312 self.alloc_tracker.on_allocation(metrics.value_bytes);
314 self.max_timestamp
315 .fetch_max(metrics.max_ts, Ordering::SeqCst);
316 self.min_timestamp
317 .fetch_min(metrics.min_ts, Ordering::SeqCst);
318 self.num_rows.fetch_add(metrics.num_rows, Ordering::SeqCst);
319 self.max_sequence
320 .fetch_max(metrics.max_sequence, Ordering::SeqCst);
321 }
322
323 #[cfg(any(test, feature = "test"))]
324 pub fn iter(
325 &self,
326 projection: Option<&[ColumnId]>,
327 predicate: Option<Predicate>,
328 sequence: Option<SequenceRange>,
329 ) -> Result<BoxedBatchIterator> {
330 self.tree.read(projection, predicate, sequence, None)
331 }
332}
333
/// Builder that creates [`PartitionTreeMemtable`]s sharing one configuration
/// and one optional write buffer manager.
#[derive(Debug, Default)]
pub struct PartitionTreeMemtableBuilder {
    /// Configuration passed to every memtable built.
    config: PartitionTreeConfig,
    /// Write buffer manager handed to every memtable built, if any.
    write_buffer_manager: Option<WriteBufferManagerRef>,
}
340
341impl PartitionTreeMemtableBuilder {
342 pub fn new(
344 config: PartitionTreeConfig,
345 write_buffer_manager: Option<WriteBufferManagerRef>,
346 ) -> Self {
347 Self {
348 config,
349 write_buffer_manager,
350 }
351 }
352}
353
354impl MemtableBuilder for PartitionTreeMemtableBuilder {
355 fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
356 let codec = build_primary_key_codec(metadata);
357 Arc::new(PartitionTreeMemtable::new(
358 id,
359 codec,
360 metadata.clone(),
361 self.write_buffer_manager.clone(),
362 &self.config,
363 ))
364 }
365
366 fn use_bulk_insert(&self, _metadata: &RegionMetadataRef) -> bool {
367 false
368 }
369}
370
/// Captures the arguments of a scan so the iterator over the tree can be built
/// later, when `IterBuilder::build` is called.
struct PartitionTreeIterBuilder {
    /// Tree to read from.
    tree: Arc<PartitionTree>,
    /// Column ids to read; `None` is forwarded to the tree unchanged.
    projection: Option<Vec<ColumnId>>,
    /// Predicate forwarded to the tree on every read.
    predicate: Option<Predicate>,
    /// Sequence range forwarded to the tree on every read.
    sequence: Option<SequenceRange>,
}
377
378impl IterBuilder for PartitionTreeIterBuilder {
379 fn build(&self, metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
380 self.tree.read(
381 self.projection.as_deref(),
382 self.predicate.clone(),
383 self.sequence,
384 metrics,
385 )
386 }
387}
388
389#[cfg(test)]
390mod tests {
391 use std::collections::HashMap;
392 use std::sync::Arc;
393
394 use api::v1::helper::{field_column_schema, row, tag_column_schema, time_index_column_schema};
395 use api::v1::value::ValueData;
396 use api::v1::{Mutation, OpType, Rows, SemanticType};
397 use common_query::prelude::{greptime_timestamp, greptime_value};
398 use common_time::Timestamp;
399 use datafusion_common::Column;
400 use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
401 use datatypes::data_type::ConcreteDataType;
402 use datatypes::prelude::Vector;
403 use datatypes::scalars::ScalarVector;
404 use datatypes::schema::ColumnSchema;
405 use datatypes::value::Value;
406 use datatypes::vectors::{Int64Vector, StringVector};
407 use mito_codec::row_converter::DensePrimaryKeyCodec;
408 use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
409 use store_api::storage::RegionId;
410
411 use super::*;
412 use crate::test_util::memtable_util::{
413 self, collect_iter_timestamps, region_metadata_to_row_schema,
414 };
415
416 #[test]
417 fn test_memtable_sorted_input() {
418 write_iter_sorted_input(true);
419 write_iter_sorted_input(false);
420 }
421
422 fn write_iter_sorted_input(has_pk: bool) {
423 let metadata = if has_pk {
424 Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
425 } else {
426 Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
427 };
428 let timestamps = (0..100).collect::<Vec<_>>();
429 let kvs =
430 memtable_util::build_key_values(&metadata, "hello".to_string(), 42, ×tamps, 1);
431 let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
432 let memtable = PartitionTreeMemtable::new(
433 1,
434 codec,
435 metadata.clone(),
436 None,
437 &PartitionTreeConfig::default(),
438 );
439 memtable.write(&kvs).unwrap();
440
441 let expected_ts = kvs
442 .iter()
443 .map(|kv| {
444 kv.timestamp()
445 .try_into_timestamp()
446 .unwrap()
447 .unwrap()
448 .value()
449 })
450 .collect::<Vec<_>>();
451
452 let iter = memtable.iter(None, None, None).unwrap();
453 let read = collect_iter_timestamps(iter);
454 assert_eq!(expected_ts, read);
455
456 let stats = memtable.stats();
457 assert!(stats.bytes_allocated() > 0);
458 assert_eq!(
459 Some((
460 Timestamp::new_millisecond(0),
461 Timestamp::new_millisecond(99)
462 )),
463 stats.time_range()
464 );
465 }
466
467 #[test]
468 fn test_memtable_unsorted_input() {
469 write_iter_unsorted_input(true);
470 write_iter_unsorted_input(false);
471 }
472
473 fn write_iter_unsorted_input(has_pk: bool) {
474 let metadata = if has_pk {
475 Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
476 } else {
477 Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
478 };
479 let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
480 let memtable = PartitionTreeMemtable::new(
481 1,
482 codec,
483 metadata.clone(),
484 None,
485 &PartitionTreeConfig::default(),
486 );
487
488 let kvs = memtable_util::build_key_values(
489 &metadata,
490 "hello".to_string(),
491 0,
492 &[1, 3, 7, 5, 6],
493 0, );
495 memtable.write(&kvs).unwrap();
496
497 let kvs = memtable_util::build_key_values(
498 &metadata,
499 "hello".to_string(),
500 0,
501 &[5, 2, 4, 0, 7],
502 5, );
504 memtable.write(&kvs).unwrap();
505
506 let iter = memtable.iter(None, None, None).unwrap();
507 let read = collect_iter_timestamps(iter);
508 assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], read);
509
510 let iter = memtable.iter(None, None, None).unwrap();
511 let read = iter
512 .flat_map(|batch| {
513 batch
514 .unwrap()
515 .sequences()
516 .iter_data()
517 .collect::<Vec<_>>()
518 .into_iter()
519 })
520 .map(|v| v.unwrap())
521 .collect::<Vec<_>>();
522 assert_eq!(vec![8, 0, 6, 1, 7, 5, 4, 9], read);
523
524 let stats = memtable.stats();
525 assert!(stats.bytes_allocated() > 0);
526 assert_eq!(
527 Some((Timestamp::new_millisecond(0), Timestamp::new_millisecond(7))),
528 stats.time_range()
529 );
530 }
531
532 #[test]
533 fn test_memtable_projection() {
534 write_iter_projection(true);
535 write_iter_projection(false);
536 }
537
538 fn write_iter_projection(has_pk: bool) {
539 let metadata = if has_pk {
540 Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
541 } else {
542 Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
543 };
544 let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
546 .build(1, &metadata);
547
548 let expect = (0..100).collect::<Vec<_>>();
549 let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
550 memtable.write(&kvs).unwrap();
551 let iter = memtable.iter(Some(&[3]), None, None).unwrap();
552
553 let mut v0_all = vec![];
554 for res in iter {
555 let batch = res.unwrap();
556 assert_eq!(1, batch.fields().len());
557 let v0 = batch
558 .fields()
559 .first()
560 .unwrap()
561 .data
562 .as_any()
563 .downcast_ref::<Int64Vector>()
564 .unwrap();
565 v0_all.extend(v0.iter_data().map(|v| v.unwrap()));
566 }
567 assert_eq!(expect, v0_all);
568 }
569
570 #[test]
571 fn test_write_iter_multi_keys() {
572 write_iter_multi_keys(1, 100);
573 write_iter_multi_keys(2, 100);
574 write_iter_multi_keys(4, 100);
575 write_iter_multi_keys(8, 5);
576 write_iter_multi_keys(2, 10);
577 }
578
579 fn write_iter_multi_keys(max_keys: usize, freeze_threshold: usize) {
580 let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
581 let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
582 let memtable = PartitionTreeMemtable::new(
583 1,
584 codec,
585 metadata.clone(),
586 None,
587 &PartitionTreeConfig {
588 index_max_keys_per_shard: max_keys,
589 data_freeze_threshold: freeze_threshold,
590 ..Default::default()
591 },
592 );
593
594 let mut data = Vec::new();
595 for i in 0..4 {
597 for j in 0..4 {
598 let timestamps = [11, 13, 1, 5, 3, 7, 9];
600 let key = format!("a{j}");
601 let kvs =
602 memtable_util::build_key_values(&metadata, key.clone(), i, ×tamps, 0);
603 memtable.write(&kvs).unwrap();
604 for ts in timestamps {
605 data.push((i, key.clone(), ts));
606 }
607 }
608 for j in 0..4 {
609 let timestamps = [10, 2, 4, 8, 6];
611 let key = format!("a{j}");
612 let kvs =
613 memtable_util::build_key_values(&metadata, key.clone(), i, ×tamps, 200);
614 memtable.write(&kvs).unwrap();
615 for ts in timestamps {
616 data.push((i, key.clone(), ts));
617 }
618 }
619 }
620 data.sort_unstable();
621
622 let expect = data.into_iter().map(|x| x.2).collect::<Vec<_>>();
623 let iter = memtable.iter(None, None, None).unwrap();
624 let read = collect_iter_timestamps(iter);
625 assert_eq!(expect, read);
626 }
627
628 #[test]
629 fn test_memtable_filter() {
630 let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false));
631 let memtable = PartitionTreeMemtableBuilder::new(
633 PartitionTreeConfig {
634 index_max_keys_per_shard: 40,
635 ..Default::default()
636 },
637 None,
638 )
639 .build(1, &metadata);
640
641 for i in 0..100 {
642 let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
643 let kvs =
644 memtable_util::build_key_values(&metadata, "hello".to_string(), i, ×tamps, 1);
645 memtable.write(&kvs).unwrap();
646 }
647
648 for i in 0..100 {
649 let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
650 let expr = Expr::BinaryExpr(BinaryExpr {
651 left: Box::new(Expr::Column(Column::from_name("k1"))),
652 op: Operator::Eq,
653 right: Box::new((i as u32).lit()),
654 });
655 let iter = memtable
656 .iter(None, Some(Predicate::new(vec![expr])), None)
657 .unwrap();
658 let read = collect_iter_timestamps(iter);
659 assert_eq!(timestamps, read);
660 }
661 }
662
663 #[test]
664 fn test_deserialize_config() {
665 let config = PartitionTreeConfig {
666 dedup: false,
667 ..Default::default()
668 };
669 let json = serde_json::to_string(&config).unwrap();
671 let config: PartitionTreeConfig = serde_json::from_str(&json).unwrap();
672 assert!(config.dedup);
673 assert_eq!(PartitionTreeConfig::default(), config);
674 }
675
676 fn metadata_for_metric_engine() -> RegionMetadataRef {
677 let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
678 builder
679 .push_column_metadata(ColumnMetadata {
680 column_schema: ColumnSchema::new(
681 "__table_id",
682 ConcreteDataType::uint32_datatype(),
683 false,
684 ),
685 semantic_type: SemanticType::Tag,
686 column_id: 2147483652,
687 })
688 .push_column_metadata(ColumnMetadata {
689 column_schema: ColumnSchema::new(
690 "__tsid",
691 ConcreteDataType::uint64_datatype(),
692 false,
693 ),
694 semantic_type: SemanticType::Tag,
695 column_id: 2147483651,
696 })
697 .push_column_metadata(ColumnMetadata {
698 column_schema: ColumnSchema::new(
699 "test_label",
700 ConcreteDataType::string_datatype(),
701 false,
702 ),
703 semantic_type: SemanticType::Tag,
704 column_id: 2,
705 })
706 .push_column_metadata(ColumnMetadata {
707 column_schema: ColumnSchema::new(
708 greptime_timestamp(),
709 ConcreteDataType::timestamp_millisecond_datatype(),
710 false,
711 ),
712 semantic_type: SemanticType::Timestamp,
713 column_id: 0,
714 })
715 .push_column_metadata(ColumnMetadata {
716 column_schema: ColumnSchema::new(
717 greptime_value(),
718 ConcreteDataType::float64_datatype(),
719 true,
720 ),
721 semantic_type: SemanticType::Field,
722 column_id: 1,
723 })
724 .primary_key(vec![2147483652, 2147483651, 2]);
725 let region_metadata = builder.build().unwrap();
726 Arc::new(region_metadata)
727 }
728
729 fn build_key_values(
730 metadata: RegionMetadataRef,
731 labels: &[&str],
732 table_id: &[u32],
733 ts_id: &[u64],
734 ts: &[i64],
735 values: &[f64],
736 sequence: u64,
737 ) -> KeyValues {
738 let column_schema = region_metadata_to_row_schema(&metadata);
739
740 let rows = ts
741 .iter()
742 .zip(table_id.iter())
743 .zip(ts_id.iter())
744 .zip(labels.iter())
745 .zip(values.iter())
746 .map(|((((ts, table_id), ts_id), label), val)| {
747 row(vec![
748 ValueData::U32Value(*table_id),
749 ValueData::U64Value(*ts_id),
750 ValueData::StringValue(label.to_string()),
751 ValueData::TimestampMillisecondValue(*ts),
752 ValueData::F64Value(*val),
753 ])
754 })
755 .collect();
756 let mutation = api::v1::Mutation {
757 op_type: 1,
758 sequence,
759 rows: Some(Rows {
760 schema: column_schema,
761 rows,
762 }),
763 write_hint: None,
764 };
765 KeyValues::new(metadata.as_ref(), mutation).unwrap()
766 }
767
768 #[test]
769 fn test_write_freeze() {
770 let metadata = metadata_for_metric_engine();
771 let memtable = PartitionTreeMemtableBuilder::new(
772 PartitionTreeConfig {
773 index_max_keys_per_shard: 40,
774 ..Default::default()
775 },
776 None,
777 )
778 .build(1, &metadata);
779
780 let codec = DensePrimaryKeyCodec::new(&metadata);
781
782 memtable
783 .write(&build_key_values(
784 metadata.clone(),
785 &["daily", "10min", "daily", "10min"],
786 &[1025, 1025, 1025, 1025],
787 &[
788 16442255374049317291,
789 5686004715529701024,
790 16442255374049317291,
791 5686004715529701024,
792 ],
793 &[1712070000000, 1712717731000, 1712761200000, 1712761200000],
794 &[0.0, 0.0, 0.0, 0.0],
795 1,
796 ))
797 .unwrap();
798
799 memtable.freeze().unwrap();
800 let new_memtable = memtable.fork(2, &metadata);
801
802 new_memtable
803 .write(&build_key_values(
804 metadata.clone(),
805 &["10min"],
806 &[1025],
807 &[5686004715529701024],
808 &[1714643131000],
809 &[0.1],
810 2,
811 ))
812 .unwrap();
813
814 let mut reader = new_memtable.iter(None, None, None).unwrap();
815 let batch = reader.next().unwrap().unwrap();
816 let pk = codec.decode(batch.primary_key()).unwrap().into_dense();
817 if let Value::String(s) = &pk[2] {
818 assert_eq!("10min", s.as_utf8());
819 } else {
820 unreachable!()
821 }
822 }
823
824 fn kv_region_metadata() -> RegionMetadataRef {
825 let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
826 builder
827 .push_column_metadata(ColumnMetadata {
828 column_schema: ColumnSchema::new(
829 "ts",
830 ConcreteDataType::timestamp_millisecond_datatype(),
831 false,
832 ),
833 semantic_type: SemanticType::Timestamp,
834 column_id: 0,
835 })
836 .push_column_metadata(ColumnMetadata {
837 column_schema: ColumnSchema::new("k", ConcreteDataType::string_datatype(), false),
838 semantic_type: SemanticType::Tag,
839 column_id: 1,
840 })
841 .push_column_metadata(ColumnMetadata {
842 column_schema: ColumnSchema::new("v", ConcreteDataType::string_datatype(), false),
843 semantic_type: SemanticType::Field,
844 column_id: 2,
845 })
846 .primary_key(vec![1]);
847 let region_metadata = builder.build().unwrap();
848 Arc::new(region_metadata)
849 }
850
851 fn kv_column_schemas() -> Vec<api::v1::ColumnSchema> {
852 vec![
853 time_index_column_schema("ts", api::v1::ColumnDataType::TimestampMillisecond),
854 tag_column_schema("k", api::v1::ColumnDataType::String),
855 field_column_schema("v", api::v1::ColumnDataType::String),
856 ]
857 }
858
859 fn key_values<T: AsRef<str>>(
860 metadata: &RegionMetadataRef,
861 keys: impl Iterator<Item = T>,
862 ) -> KeyValues {
863 let rows = keys
864 .map(|c| {
865 row(vec![
866 ValueData::TimestampMillisecondValue(0),
867 ValueData::StringValue(c.as_ref().to_string()),
868 ValueData::StringValue(c.as_ref().to_string()),
869 ])
870 })
871 .collect();
872 let mutation = Mutation {
873 op_type: OpType::Put as i32,
874 sequence: 0,
875 rows: Some(Rows {
876 schema: kv_column_schemas(),
877 rows,
878 }),
879 write_hint: None,
880 };
881 KeyValues::new(metadata, mutation).unwrap()
882 }
883
884 fn collect_kvs(
885 iter: BoxedBatchIterator,
886 region_meta: &RegionMetadataRef,
887 ) -> HashMap<String, String> {
888 let decoder = DensePrimaryKeyCodec::new(region_meta);
889 let mut res = HashMap::new();
890 for v in iter {
891 let batch = v.unwrap();
892 let values = decoder.decode(batch.primary_key()).unwrap().into_dense();
893 let field_vector = batch.fields()[0]
894 .data
895 .as_any()
896 .downcast_ref::<StringVector>()
897 .unwrap();
898 for row in 0..batch.num_rows() {
899 res.insert(
900 values[0].as_string().unwrap(),
901 field_vector.get(row).as_string().unwrap(),
902 );
903 }
904 }
905 res
906 }
907
908 #[test]
909 fn test_reorder_insert_key_values() {
910 let metadata = kv_region_metadata();
911 let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
912 .build(1, &metadata);
913
914 memtable
915 .write(&key_values(&metadata, ('a'..'h').map(|c| c.to_string())))
916 .unwrap();
917 memtable.freeze().unwrap();
918 assert_eq!(
919 collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata),
920 ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect()
921 );
922 let forked = memtable.fork(2, &metadata);
923
924 let keys = ["c", "f", "i", "h", "b", "e", "g"];
925 forked.write(&key_values(&metadata, keys.iter())).unwrap();
926 forked.freeze().unwrap();
927 assert_eq!(
928 collect_kvs(forked.iter(None, None, None).unwrap(), &metadata),
929 keys.iter()
930 .map(|c| (c.to_string(), c.to_string()))
931 .collect()
932 );
933
934 let forked2 = forked.fork(3, &metadata);
935
936 let keys = ["g", "e", "a", "f", "b", "c", "h"];
937 forked2.write(&key_values(&metadata, keys.iter())).unwrap();
938
939 let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata);
940 let expected = keys
941 .iter()
942 .map(|c| (c.to_string(), c.to_string()))
943 .collect::<HashMap<_, _>>();
944 assert_eq!(kvs, expected);
945 }
946
947 #[test]
948 fn test_build_record_batch_iter_from_memtable() {
949 let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
950 let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
951 let memtable = PartitionTreeMemtable::new(
952 1,
953 codec,
954 metadata.clone(),
955 None,
956 &PartitionTreeConfig::default(),
957 );
958
959 let kvs =
960 memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &[1, 2, 3], 0);
961 memtable.write(&kvs).unwrap();
962
963 let read_column_ids: Vec<ColumnId> = metadata
964 .column_metadatas
965 .iter()
966 .map(|c| c.column_id)
967 .collect();
968 let ranges = memtable
969 .ranges(Some(&read_column_ids), RangesOptions::default())
970 .unwrap();
971 assert!(!ranges.ranges.is_empty());
972
973 let mut total_rows = 0;
974 for range in ranges.ranges.into_values() {
975 let mut iter = range.build_record_batch_iter(None, None).unwrap();
976 while let Some(rb) = iter.next().transpose().unwrap() {
977 total_rows += rb.num_rows();
978 let schema = rb.schema();
979 let column_names: Vec<_> =
980 schema.fields().iter().map(|f| f.name().as_str()).collect();
981 assert_eq!(
982 column_names,
983 vec![
984 "__table_id",
985 "k0",
986 "v0",
987 "v1",
988 "ts",
989 "__primary_key",
990 "__sequence",
991 "__op_type",
992 ]
993 );
994 }
995 }
996 assert_eq!(3, total_rows);
997 }
998
999 #[test]
1000 fn test_build_record_batch_iter_with_time_range() {
1001 let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
1002 let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
1003 let memtable = PartitionTreeMemtable::new(
1004 1,
1005 codec,
1006 metadata.clone(),
1007 None,
1008 &PartitionTreeConfig::default(),
1009 );
1010
1011 let kvs = memtable_util::build_key_values(
1012 &metadata,
1013 "hello".to_string(),
1014 42,
1015 &[1, 2, 3, 4, 5],
1016 0,
1017 );
1018 memtable.write(&kvs).unwrap();
1019
1020 let read_column_ids: Vec<ColumnId> = metadata
1021 .column_metadatas
1022 .iter()
1023 .map(|c| c.column_id)
1024 .collect();
1025 let ranges = memtable
1026 .ranges(Some(&read_column_ids), RangesOptions::default())
1027 .unwrap();
1028 assert!(!ranges.ranges.is_empty());
1029
1030 let time_range = (Timestamp::new_millisecond(2), Timestamp::new_millisecond(4));
1031
1032 let mut total_rows = 0;
1033 let mut all_timestamps = Vec::new();
1034 for range in ranges.ranges.into_values() {
1035 let mut iter = range
1036 .build_record_batch_iter(Some(time_range), None)
1037 .unwrap();
1038 while let Some(rb) = iter.next().transpose().unwrap() {
1039 total_rows += rb.num_rows();
1040 let ts_col = rb
1042 .column_by_name("ts")
1043 .unwrap()
1044 .as_any()
1045 .downcast_ref::<datatypes::arrow::array::TimestampMillisecondArray>()
1046 .unwrap();
1047 for i in 0..ts_col.len() {
1048 all_timestamps.push(ts_col.value(i));
1049 }
1050 }
1051 }
1052 assert_eq!(3, total_rows);
1053 all_timestamps.sort();
1054 assert_eq!(vec![2, 3, 4], all_timestamps);
1055 }
1056}