1use std::collections::BTreeMap;
18use std::fmt;
19use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
20use std::sync::{Arc, Mutex};
21use std::time::Duration;
22
23pub use bulk::part::EncodedBulkPart;
24use bytes::Bytes;
25use common_time::Timestamp;
26use datatypes::arrow::datatypes::SchemaRef;
27use datatypes::arrow::record_batch::RecordBatch;
28use mito_codec::key_values::KeyValue;
29pub use mito_codec::key_values::KeyValues;
30use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
31use snafu::ensure;
32use store_api::codec::PrimaryKeyEncoding;
33use store_api::metadata::RegionMetadataRef;
34use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
35
36use crate::config::MitoConfig;
37use crate::error::{Result, UnsupportedOperationSnafu};
38use crate::flush::WriteBufferManagerRef;
39use crate::memtable::bulk::{BulkMemtableBuilder, CompactDispatcher};
40use crate::memtable::time_series::TimeSeriesMemtableBuilder;
41use crate::metrics::WRITE_BUFFER_BYTES;
42use crate::read::Batch;
43use crate::read::batch_adapter::BatchToRecordBatchAdapter;
44use crate::read::prune::PruneTimeIterator;
45use crate::read::scan_region::PredicateGroup;
46use crate::region::options::{MemtableOptions, MergeMode, RegionOptions};
47use crate::sst::FormatType;
48use crate::sst::file::FileTimeRange;
49use crate::sst::parquet::SstInfo;
50use crate::sst::parquet::file_range::PreFilterMode;
51
52mod builder;
53pub mod bulk;
54pub mod simple_bulk_memtable;
55mod stats;
56pub mod time_partition;
57pub mod time_series;
58pub(crate) mod version;
59
60pub use bulk::part::{
61 BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
62 sort_primary_key_record_batch,
63};
64#[cfg(any(test, feature = "test"))]
65pub use time_partition::filter_record_batch;
66
/// Numeric identifier of a memtable, as returned by [`Memtable::id`].
pub type MemtableId = u32;
71
72#[derive(Clone)]
74pub struct RangesOptions {
75 pub for_flush: bool,
77 pub pre_filter_mode: PreFilterMode,
79 pub predicate: PredicateGroup,
81 pub sequence: Option<SequenceRange>,
83}
84
85impl Default for RangesOptions {
86 fn default() -> Self {
87 Self {
88 for_flush: false,
89 pre_filter_mode: PreFilterMode::All,
90 predicate: PredicateGroup::default(),
91 sequence: None,
92 }
93 }
94}
95
96impl RangesOptions {
97 pub fn for_flush() -> Self {
99 Self {
100 for_flush: true,
101 pre_filter_mode: PreFilterMode::All,
102 predicate: PredicateGroup::default(),
103 sequence: None,
104 }
105 }
106
107 #[must_use]
109 pub fn with_pre_filter_mode(mut self, pre_filter_mode: PreFilterMode) -> Self {
110 self.pre_filter_mode = pre_filter_mode;
111 self
112 }
113
114 #[must_use]
116 pub fn with_predicate(mut self, predicate: PredicateGroup) -> Self {
117 self.predicate = predicate;
118 self
119 }
120
121 #[must_use]
123 pub fn with_sequence(mut self, sequence: Option<SequenceRange>) -> Self {
124 self.sequence = sequence;
125 self
126 }
127}
128
129#[derive(Debug, Default, Clone)]
130pub struct MemtableStats {
131 pub estimated_bytes: usize,
133 pub time_range: Option<(Timestamp, Timestamp)>,
136 pub num_rows: usize,
138 pub num_ranges: usize,
140 pub max_sequence: SequenceNumber,
142 pub series_count: usize,
144}
145
146impl MemtableStats {
147 #[cfg(any(test, feature = "test"))]
149 pub fn with_time_range(mut self, time_range: Option<(Timestamp, Timestamp)>) -> Self {
150 self.time_range = time_range;
151 self
152 }
153
154 #[cfg(feature = "test")]
155 pub fn with_max_sequence(mut self, max_sequence: SequenceNumber) -> Self {
156 self.max_sequence = max_sequence;
157 self
158 }
159
160 pub fn bytes_allocated(&self) -> usize {
162 self.estimated_bytes
163 }
164
165 pub fn time_range(&self) -> Option<(Timestamp, Timestamp)> {
167 self.time_range
168 }
169
170 pub fn num_rows(&self) -> usize {
172 self.num_rows
173 }
174
175 pub fn num_ranges(&self) -> usize {
177 self.num_ranges
178 }
179
180 pub fn max_sequence(&self) -> SequenceNumber {
182 self.max_sequence
183 }
184
185 pub fn series_count(&self) -> usize {
187 self.series_count
188 }
189}
190
/// Boxed, `Send` iterator that yields `Result<Batch>` items.
pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send>;
192
/// Boxed, `Send` iterator that yields `Result<RecordBatch>` items.
pub type BoxedRecordBatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>> + Send>;
194
195#[derive(Default)]
197pub struct MemtableRanges {
198 pub ranges: BTreeMap<usize, MemtableRange>,
200}
201
202impl MemtableRanges {
203 pub fn num_rows(&self) -> usize {
205 self.ranges.values().map(|r| r.stats().num_rows()).sum()
206 }
207
208 pub fn series_count(&self) -> usize {
210 self.ranges.values().map(|r| r.stats().series_count()).sum()
211 }
212
213 pub fn max_sequence(&self) -> SequenceNumber {
215 self.ranges
216 .values()
217 .map(|r| r.stats().max_sequence())
218 .max()
219 .unwrap_or(0)
220 }
221}
222
223impl IterBuilder for MemtableRanges {
224 fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
225 ensure!(
226 self.ranges.len() == 1,
227 UnsupportedOperationSnafu {
228 err_msg: format!(
229 "Building an iterator from MemtableRanges expects 1 range, but got {}",
230 self.ranges.len()
231 ),
232 }
233 );
234
235 self.ranges.values().next().unwrap().build_iter()
236 }
237
238 fn is_record_batch(&self) -> bool {
239 self.ranges.values().all(|range| range.is_record_batch())
240 }
241}
242
/// Interface implemented by every memtable of this module.
pub trait Memtable: Send + Sync + fmt::Debug {
    /// Returns the id of this memtable.
    fn id(&self) -> MemtableId;

    /// Writes the given key values into the memtable.
    fn write(&self, kvs: &KeyValues) -> Result<()>;

    /// Writes a single key value into the memtable.
    fn write_one(&self, key_value: KeyValue) -> Result<()>;

    /// Writes a bulk part into the memtable.
    fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>;

    /// Returns the ranges of the memtable for the given projection and options.
    ///
    /// `projection` is an optional list of column ids; `None` is presumably
    /// "all columns" — confirm against implementors.
    fn ranges(
        &self,
        projection: Option<&[ColumnId]>,
        options: RangesOptions,
    ) -> Result<MemtableRanges>;

    /// Returns `true` if the memtable is empty.
    fn is_empty(&self) -> bool;

    /// Freezes the memtable.
    // NOTE(review): the exact effect of freezing is implementation-defined and not visible here.
    fn freeze(&self) -> Result<()>;

    /// Returns the statistics of the memtable.
    fn stats(&self) -> MemtableStats;

    /// Creates a memtable with the given `id` and `metadata` from this one.
    // NOTE(review): what state (if any) is carried over is decided by implementors.
    fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;

    /// Compacts the memtable.
    ///
    /// The default implementation ignores `for_flush` and returns `Ok(())`.
    fn compact(&self, for_flush: bool) -> Result<()> {
        let _ = for_flush;
        Ok(())
    }
}
288
/// Shared, reference-counted handle to a [`Memtable`] trait object.
pub type MemtableRef = Arc<dyn Memtable>;
290
/// Factory of memtables.
pub trait MemtableBuilder: Send + Sync + fmt::Debug {
    /// Builds a memtable with the given `id` for the region described by `metadata`.
    fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;

    /// Returns whether bulk insert should be used for the given region metadata.
    ///
    /// The default implementation ignores `metadata` and returns `false`.
    fn use_bulk_insert(&self, metadata: &RegionMetadataRef) -> bool {
        let _metadata = metadata;
        false
    }
}
302
/// Shared, reference-counted handle to a [`MemtableBuilder`] trait object.
pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
304
305#[derive(Default)]
307pub struct AllocTracker {
308 write_buffer_manager: Option<WriteBufferManagerRef>,
309 bytes_allocated: AtomicUsize,
311 is_done_allocating: AtomicBool,
313}
314
315impl fmt::Debug for AllocTracker {
316 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
317 f.debug_struct("AllocTracker")
318 .field("bytes_allocated", &self.bytes_allocated)
319 .field("is_done_allocating", &self.is_done_allocating)
320 .finish()
321 }
322}
323
324impl AllocTracker {
325 pub fn new(write_buffer_manager: Option<WriteBufferManagerRef>) -> AllocTracker {
327 AllocTracker {
328 write_buffer_manager,
329 bytes_allocated: AtomicUsize::new(0),
330 is_done_allocating: AtomicBool::new(false),
331 }
332 }
333
334 pub(crate) fn on_allocation(&self, bytes: usize) {
336 self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
337 WRITE_BUFFER_BYTES.add(bytes as i64);
338 if let Some(write_buffer_manager) = &self.write_buffer_manager {
339 write_buffer_manager.reserve_mem(bytes);
340 }
341 }
342
343 pub(crate) fn done_allocating(&self) {
348 if let Some(write_buffer_manager) = &self.write_buffer_manager
349 && self
350 .is_done_allocating
351 .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
352 .is_ok()
353 {
354 write_buffer_manager.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
355 }
356 }
357
358 pub(crate) fn bytes_allocated(&self) -> usize {
360 self.bytes_allocated.load(Ordering::Relaxed)
361 }
362
363 pub(crate) fn write_buffer_manager(&self) -> Option<WriteBufferManagerRef> {
365 self.write_buffer_manager.clone()
366 }
367}
368
369impl Drop for AllocTracker {
370 fn drop(&mut self) {
371 if !self.is_done_allocating.load(Ordering::Relaxed) {
372 self.done_allocating();
373 }
374
375 let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed);
376 WRITE_BUFFER_BYTES.sub(bytes_allocated as i64);
377
378 if let Some(write_buffer_manager) = &self.write_buffer_manager {
380 write_buffer_manager.free_mem(bytes_allocated);
381 }
382 }
383}
384
385#[derive(Clone)]
387pub(crate) struct MemtableBuilderProvider {
388 write_buffer_manager: Option<WriteBufferManagerRef>,
389 config: Arc<MitoConfig>,
390 compact_dispatcher: Arc<CompactDispatcher>,
391}
392
393impl MemtableBuilderProvider {
394 pub(crate) fn new(
395 write_buffer_manager: Option<WriteBufferManagerRef>,
396 config: Arc<MitoConfig>,
397 ) -> Self {
398 let compact_dispatcher =
399 Arc::new(CompactDispatcher::new(config.max_background_compactions));
400
401 Self {
402 write_buffer_manager,
403 config,
404 compact_dispatcher,
405 }
406 }
407
408 pub(crate) fn builder_for_options(&self, options: &RegionOptions) -> MemtableBuilderRef {
409 let dedup = options.need_dedup();
410 let merge_mode = options.merge_mode();
411 let primary_key_encoding = options.primary_key_encoding();
412 let flat_format = options
413 .sst_format
414 .map(|format| format == FormatType::Flat)
415 .unwrap_or(self.config.default_flat_format);
416 if flat_format {
417 if options.memtable.is_some()
418 && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
419 {
420 common_telemetry::info!(
421 "Overriding memtable config, use BulkMemtable under flat format"
422 );
423 }
424
425 return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
426 }
427
428 if primary_key_encoding == PrimaryKeyEncoding::Sparse {
429 if options.memtable.is_some()
430 && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
431 {
432 common_telemetry::info!(
433 "Overriding memtable config, use BulkMemtable for sparse primary key encoding"
434 );
435 }
436 return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
437 }
438
439 match &options.memtable {
441 Some(MemtableOptions::Bulk(config)) => Arc::new(
442 BulkMemtableBuilder::new(self.write_buffer_manager.clone(), !dedup, merge_mode)
443 .with_config(config.clone())
444 .with_compact_dispatcher(self.compact_dispatcher.clone()),
445 ),
446 Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new(
447 self.write_buffer_manager.clone(),
448 dedup,
449 merge_mode,
450 )),
451 None => self.default_primary_key_memtable_builder(dedup, merge_mode),
452 }
453 }
454
455 fn bulk_memtable_builder(
456 &self,
457 dedup: bool,
458 merge_mode: MergeMode,
459 options: &RegionOptions,
460 ) -> BulkMemtableBuilder {
461 let mut builder = BulkMemtableBuilder::new(
462 self.write_buffer_manager.clone(),
463 !dedup, merge_mode,
465 )
466 .with_compact_dispatcher(self.compact_dispatcher.clone());
467
468 if let Some(MemtableOptions::Bulk(config)) = &options.memtable {
469 builder = builder.with_config(config.clone());
470 }
471
472 builder
473 }
474
475 fn default_primary_key_memtable_builder(
476 &self,
477 dedup: bool,
478 merge_mode: MergeMode,
479 ) -> MemtableBuilderRef {
480 Arc::new(TimeSeriesMemtableBuilder::new(
481 self.write_buffer_manager.clone(),
482 dedup,
483 merge_mode,
484 ))
485 }
486}
487
/// Shared accumulator of memtable scan metrics.
///
/// Clones share the same underlying [`MemScanMetricsData`] behind a mutex.
#[derive(Clone, Default)]
pub struct MemScanMetrics(Arc<Mutex<MemScanMetricsData>>);

impl MemScanMetrics {
    /// Adds every field of `inner` to the shared metrics.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub(crate) fn merge_inner(&self, inner: &MemScanMetricsData) {
        // Exhaustive destructuring: adding a field without merging it fails to compile.
        let MemScanMetricsData {
            total_series,
            num_rows,
            num_batches,
            scan_cost,
            prefilter_cost,
            prefilter_rows_filtered,
        } = *inner;

        let mut totals = self.0.lock().unwrap();
        totals.total_series += total_series;
        totals.num_rows += num_rows;
        totals.num_batches += num_batches;
        totals.scan_cost += scan_cost;
        totals.prefilter_cost += prefilter_cost;
        totals.prefilter_rows_filtered += prefilter_rows_filtered;
    }

    /// Returns a snapshot (clone) of the current metrics.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub(crate) fn data(&self) -> MemScanMetricsData {
        let guard = self.0.lock().unwrap();
        MemScanMetricsData::clone(&guard)
    }
}

/// Plain metrics values collected while scanning a memtable.
#[derive(Clone, Default)]
pub(crate) struct MemScanMetricsData {
    /// Accumulated series count.
    pub(crate) total_series: usize,
    /// Accumulated row count.
    pub(crate) num_rows: usize,
    /// Accumulated batch count.
    pub(crate) num_batches: usize,
    /// Accumulated scan duration.
    pub(crate) scan_cost: Duration,
    /// Accumulated pre-filter duration.
    pub(crate) prefilter_cost: Duration,
    /// Accumulated count of rows filtered by pre-filtering.
    pub(crate) prefilter_rows_filtered: usize,
}
525
/// Encoded form of a range, as returned by [`IterBuilder::encoded_range`].
pub struct EncodedRange {
    /// Encoded bytes.
    pub data: Bytes,
    /// SST info associated with `data`.
    pub sst_info: SstInfo,
}
533
534pub trait IterBuilder: Send + Sync {
537 fn build(&self, metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator>;
539
540 fn is_record_batch(&self) -> bool {
542 false
543 }
544
545 fn build_record_batch(
549 &self,
550 time_range: Option<(Timestamp, Timestamp)>,
551 metrics: Option<MemScanMetrics>,
552 ) -> Result<BoxedRecordBatchIterator> {
553 let _metrics = metrics;
554 let _ = time_range;
555 UnsupportedOperationSnafu {
556 err_msg: "Record batch iterator is not supported by this memtable",
557 }
558 .fail()
559 }
560
561 fn record_batch_schema_hint(&self) -> Option<SchemaRef> {
563 None
564 }
565
566 fn encoded_range(&self) -> Option<EncodedRange> {
568 None
569 }
570}
571
/// Boxed [`IterBuilder`] trait object.
pub type BoxedIterBuilder = Box<dyn IterBuilder>;
573
574pub fn read_column_ids_from_projection(
579 metadata: &RegionMetadataRef,
580 projection: Option<&[ColumnId]>,
581) -> Vec<ColumnId> {
582 if let Some(projection) = projection {
583 projection.to_vec()
584 } else {
585 metadata
586 .column_metadatas
587 .iter()
588 .map(|c| c.column_id)
589 .collect()
590 }
591}
592
593pub struct BatchToRecordBatchContext {
595 metadata: RegionMetadataRef,
596 codec: Arc<dyn PrimaryKeyCodec>,
597 read_column_ids: Vec<ColumnId>,
598}
599
600impl BatchToRecordBatchContext {
601 pub fn new(metadata: RegionMetadataRef, mut read_column_ids: Vec<ColumnId>) -> Self {
603 if read_column_ids.is_empty() {
604 read_column_ids.push(metadata.time_index_column().column_id);
605 }
606
607 let codec = build_primary_key_codec(&metadata);
608 Self {
609 metadata,
610 codec,
611 read_column_ids,
612 }
613 }
614
615 fn adapt_iter(&self, iter: BoxedBatchIterator) -> BoxedRecordBatchIterator {
616 Box::new(BatchToRecordBatchAdapter::new(
617 iter,
618 self.metadata.clone(),
619 self.codec.clone(),
620 &self.read_column_ids,
621 ))
622 }
623}
624
625pub struct MemtableRangeContext {
627 id: MemtableId,
629 builder: BoxedIterBuilder,
631 predicate: PredicateGroup,
633 batch_to_record_batch: Option<Arc<BatchToRecordBatchContext>>,
635}
636
637pub type MemtableRangeContextRef = Arc<MemtableRangeContext>;
638
639impl MemtableRangeContext {
640 pub fn new(id: MemtableId, builder: BoxedIterBuilder, predicate: PredicateGroup) -> Self {
642 Self::new_with_batch_to_record_batch(id, builder, predicate, None)
643 }
644
645 pub fn new_with_batch_to_record_batch(
647 id: MemtableId,
648 builder: BoxedIterBuilder,
649 predicate: PredicateGroup,
650 batch_to_record_batch: Option<Arc<BatchToRecordBatchContext>>,
651 ) -> Self {
652 Self {
653 id,
654 builder,
655 predicate,
656 batch_to_record_batch,
657 }
658 }
659}
660
661#[derive(Clone)]
663pub struct MemtableRange {
664 context: MemtableRangeContextRef,
666 stats: MemtableStats,
668}
669
670impl MemtableRange {
671 pub fn new(context: MemtableRangeContextRef, stats: MemtableStats) -> Self {
673 Self { context, stats }
674 }
675
676 pub fn stats(&self) -> &MemtableStats {
678 &self.stats
679 }
680
681 pub fn id(&self) -> MemtableId {
683 self.context.id
684 }
685
686 pub fn build_prune_iter(
690 &self,
691 time_range: FileTimeRange,
692 metrics: Option<MemScanMetrics>,
693 ) -> Result<BoxedBatchIterator> {
694 let iter = self.context.builder.build(metrics)?;
695 let time_filters = self.context.predicate.time_filters();
696 Ok(Box::new(PruneTimeIterator::new(
697 iter,
698 time_range,
699 time_filters,
700 )))
701 }
702
703 pub fn build_iter(&self) -> Result<BoxedBatchIterator> {
705 self.context.builder.build(None)
706 }
707
708 pub fn build_record_batch_iter(
713 &self,
714 time_range: Option<FileTimeRange>,
715 metrics: Option<MemScanMetrics>,
716 ) -> Result<BoxedRecordBatchIterator> {
717 if self.context.builder.is_record_batch() {
718 return self.context.builder.build_record_batch(time_range, metrics);
719 }
720
721 if let Some(context) = self.context.batch_to_record_batch.as_ref() {
722 let iter = self.context.builder.build(metrics)?;
723 let iter: BoxedBatchIterator = if let Some(time_range) = time_range {
724 let time_filters = self.context.predicate.time_filters();
725 Box::new(PruneTimeIterator::new(iter, time_range, time_filters))
726 } else {
727 iter
728 };
729 return Ok(context.adapt_iter(iter));
730 }
731
732 UnsupportedOperationSnafu {
733 err_msg: "Record batch iterator is not supported by this memtable",
734 }
735 .fail()
736 }
737
738 pub fn record_batch_schema_hint(&self) -> Option<SchemaRef> {
740 self.context.builder.record_batch_schema_hint()
741 }
742
743 pub fn is_record_batch(&self) -> bool {
745 self.context.builder.is_record_batch()
746 }
747
748 pub fn num_rows(&self) -> usize {
749 self.stats.num_rows
750 }
751
752 pub fn encoded(&self) -> Option<EncodedRange> {
754 self.context.builder.encoded_range()
755 }
756}
757
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
    use crate::memtable::bulk::BulkMemtableConfig;

    /// Without a manager the tracker only counts bytes locally.
    #[test]
    fn test_alloc_tracker_without_manager() {
        let tracker = AllocTracker::new(None);
        assert_eq!(0, tracker.bytes_allocated());
        tracker.on_allocation(100);
        assert_eq!(100, tracker.bytes_allocated());
        tracker.on_allocation(200);
        assert_eq!(300, tracker.bytes_allocated());

        // Finishing allocation does not change the recorded byte count.
        tracker.done_allocating();
        assert_eq!(300, tracker.bytes_allocated());
    }

    /// With a manager, allocations are mirrored in the manager's usage counters.
    #[test]
    fn test_alloc_tracker_with_manager() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());

            // Calling `done_allocating` twice must have the same effect as calling it once.
            for _ in 0..2 {
                tracker.done_allocating();
                assert_eq!(100, manager.memory_usage());
                assert_eq!(0, manager.mutable_usage());
            }
        }

        // Dropping the tracker releases the remaining memory usage.
        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }

    /// Dropping a tracker that never called `done_allocating` still releases everything.
    #[test]
    fn test_alloc_tracker_without_done_allocating() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());
        }

        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }

    /// The bulk builder keeps the bulk config supplied through the region options.
    #[test]
    fn test_forced_bulk_memtable_preserves_bulk_config() {
        let provider = MemtableBuilderProvider::new(None, Arc::new(MitoConfig::default()));
        let config = BulkMemtableConfig {
            merge_threshold: 7,
            encode_row_threshold: 11,
            encode_bytes_threshold: 13,
            max_merge_groups: 17,
        };
        let options = RegionOptions {
            memtable: Some(MemtableOptions::Bulk(config.clone())),
            primary_key_encoding: Some(PrimaryKeyEncoding::Sparse),
            ..Default::default()
        };

        let builder =
            provider.bulk_memtable_builder(options.need_dedup(), options.merge_mode(), &options);

        assert_eq!(&config, builder.config());
    }
}