1use std::collections::BTreeMap;
18use std::fmt;
19use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
20use std::sync::{Arc, Mutex};
21use std::time::Duration;
22
23pub use bulk::part::EncodedBulkPart;
24use bytes::Bytes;
25use common_time::Timestamp;
26use datatypes::arrow::record_batch::RecordBatch;
27use mito_codec::key_values::KeyValue;
28pub use mito_codec::key_values::KeyValues;
29use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
30use snafu::ensure;
31use store_api::codec::PrimaryKeyEncoding;
32use store_api::metadata::RegionMetadataRef;
33use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
34
35use crate::config::MitoConfig;
36use crate::error::{Result, UnsupportedOperationSnafu};
37use crate::flush::WriteBufferManagerRef;
38use crate::memtable::bulk::{BulkMemtableBuilder, CompactDispatcher};
39use crate::memtable::time_series::TimeSeriesMemtableBuilder;
40use crate::metrics::WRITE_BUFFER_BYTES;
41use crate::read::Batch;
42use crate::read::batch_adapter::BatchToRecordBatchAdapter;
43use crate::read::prune::PruneTimeIterator;
44use crate::read::scan_region::PredicateGroup;
45use crate::region::options::{MemtableOptions, MergeMode, RegionOptions};
46use crate::sst::FormatType;
47use crate::sst::file::FileTimeRange;
48use crate::sst::parquet::SstInfo;
49use crate::sst::parquet::file_range::PreFilterMode;
50
51mod builder;
52pub mod bulk;
53pub mod simple_bulk_memtable;
54mod stats;
55pub mod time_partition;
56pub mod time_series;
57pub(crate) mod version;
58
59pub use bulk::part::{
60 BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
61 sort_primary_key_record_batch,
62};
63#[cfg(any(test, feature = "test"))]
64pub use time_partition::filter_record_batch;
65
/// Identifier of a memtable, represented as a 32-bit unsigned integer.
pub type MemtableId = u32;
70
71#[derive(Clone)]
73pub struct RangesOptions {
74 pub for_flush: bool,
76 pub pre_filter_mode: PreFilterMode,
78 pub predicate: PredicateGroup,
80 pub sequence: Option<SequenceRange>,
82}
83
84impl Default for RangesOptions {
85 fn default() -> Self {
86 Self {
87 for_flush: false,
88 pre_filter_mode: PreFilterMode::All,
89 predicate: PredicateGroup::default(),
90 sequence: None,
91 }
92 }
93}
94
95impl RangesOptions {
96 pub fn for_flush() -> Self {
98 Self {
99 for_flush: true,
100 pre_filter_mode: PreFilterMode::All,
101 predicate: PredicateGroup::default(),
102 sequence: None,
103 }
104 }
105
106 #[must_use]
108 pub fn with_pre_filter_mode(mut self, pre_filter_mode: PreFilterMode) -> Self {
109 self.pre_filter_mode = pre_filter_mode;
110 self
111 }
112
113 #[must_use]
115 pub fn with_predicate(mut self, predicate: PredicateGroup) -> Self {
116 self.predicate = predicate;
117 self
118 }
119
120 #[must_use]
122 pub fn with_sequence(mut self, sequence: Option<SequenceRange>) -> Self {
123 self.sequence = sequence;
124 self
125 }
126}
127
128#[derive(Debug, Default, Clone)]
129pub struct MemtableStats {
130 pub estimated_bytes: usize,
132 pub time_range: Option<(Timestamp, Timestamp)>,
135 pub num_rows: usize,
137 pub num_ranges: usize,
139 pub max_sequence: SequenceNumber,
141 pub series_count: usize,
143}
144
145impl MemtableStats {
146 #[cfg(any(test, feature = "test"))]
148 pub fn with_time_range(mut self, time_range: Option<(Timestamp, Timestamp)>) -> Self {
149 self.time_range = time_range;
150 self
151 }
152
153 #[cfg(feature = "test")]
154 pub fn with_max_sequence(mut self, max_sequence: SequenceNumber) -> Self {
155 self.max_sequence = max_sequence;
156 self
157 }
158
159 pub fn bytes_allocated(&self) -> usize {
161 self.estimated_bytes
162 }
163
164 pub fn time_range(&self) -> Option<(Timestamp, Timestamp)> {
166 self.time_range
167 }
168
169 pub fn num_rows(&self) -> usize {
171 self.num_rows
172 }
173
174 pub fn num_ranges(&self) -> usize {
176 self.num_ranges
177 }
178
179 pub fn max_sequence(&self) -> SequenceNumber {
181 self.max_sequence
182 }
183
184 pub fn series_count(&self) -> usize {
186 self.series_count
187 }
188}
189
190pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send>;
191
192pub type BoxedRecordBatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>> + Send>;
193
194#[derive(Default)]
196pub struct MemtableRanges {
197 pub ranges: BTreeMap<usize, MemtableRange>,
199}
200
201impl MemtableRanges {
202 pub fn num_rows(&self) -> usize {
204 self.ranges.values().map(|r| r.stats().num_rows()).sum()
205 }
206
207 pub fn series_count(&self) -> usize {
209 self.ranges.values().map(|r| r.stats().series_count()).sum()
210 }
211
212 pub fn max_sequence(&self) -> SequenceNumber {
214 self.ranges
215 .values()
216 .map(|r| r.stats().max_sequence())
217 .max()
218 .unwrap_or(0)
219 }
220}
221
222impl IterBuilder for MemtableRanges {
223 fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
224 ensure!(
225 self.ranges.len() == 1,
226 UnsupportedOperationSnafu {
227 err_msg: format!(
228 "Building an iterator from MemtableRanges expects 1 range, but got {}",
229 self.ranges.len()
230 ),
231 }
232 );
233
234 self.ranges.values().next().unwrap().build_iter()
235 }
236
237 fn is_record_batch(&self) -> bool {
238 self.ranges.values().all(|range| range.is_record_batch())
239 }
240}
241
242pub trait Memtable: Send + Sync + fmt::Debug {
244 fn id(&self) -> MemtableId;
246
247 fn write(&self, kvs: &KeyValues) -> Result<()>;
249
250 fn write_one(&self, key_value: KeyValue) -> Result<()>;
252
253 fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>;
255
256 fn ranges(
260 &self,
261 projection: Option<&[ColumnId]>,
262 options: RangesOptions,
263 ) -> Result<MemtableRanges>;
264
265 fn is_empty(&self) -> bool;
267
268 fn freeze(&self) -> Result<()>;
270
271 fn stats(&self) -> MemtableStats;
273
274 fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;
278
279 fn compact(&self, for_flush: bool) -> Result<()> {
283 let _ = for_flush;
284 Ok(())
285 }
286}
287
288pub type MemtableRef = Arc<dyn Memtable>;
289
290pub trait MemtableBuilder: Send + Sync + fmt::Debug {
292 fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;
294
295 fn use_bulk_insert(&self, metadata: &RegionMetadataRef) -> bool {
297 let _metadata = metadata;
298 false
299 }
300}
301
302pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
303
304#[derive(Default)]
306pub struct AllocTracker {
307 write_buffer_manager: Option<WriteBufferManagerRef>,
308 bytes_allocated: AtomicUsize,
310 is_done_allocating: AtomicBool,
312}
313
314impl fmt::Debug for AllocTracker {
315 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
316 f.debug_struct("AllocTracker")
317 .field("bytes_allocated", &self.bytes_allocated)
318 .field("is_done_allocating", &self.is_done_allocating)
319 .finish()
320 }
321}
322
323impl AllocTracker {
324 pub fn new(write_buffer_manager: Option<WriteBufferManagerRef>) -> AllocTracker {
326 AllocTracker {
327 write_buffer_manager,
328 bytes_allocated: AtomicUsize::new(0),
329 is_done_allocating: AtomicBool::new(false),
330 }
331 }
332
333 pub(crate) fn on_allocation(&self, bytes: usize) {
335 self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
336 WRITE_BUFFER_BYTES.add(bytes as i64);
337 if let Some(write_buffer_manager) = &self.write_buffer_manager {
338 write_buffer_manager.reserve_mem(bytes);
339 }
340 }
341
342 pub(crate) fn done_allocating(&self) {
347 if let Some(write_buffer_manager) = &self.write_buffer_manager
348 && self
349 .is_done_allocating
350 .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
351 .is_ok()
352 {
353 write_buffer_manager.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
354 }
355 }
356
357 pub(crate) fn bytes_allocated(&self) -> usize {
359 self.bytes_allocated.load(Ordering::Relaxed)
360 }
361
362 pub(crate) fn write_buffer_manager(&self) -> Option<WriteBufferManagerRef> {
364 self.write_buffer_manager.clone()
365 }
366}
367
368impl Drop for AllocTracker {
369 fn drop(&mut self) {
370 if !self.is_done_allocating.load(Ordering::Relaxed) {
371 self.done_allocating();
372 }
373
374 let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed);
375 WRITE_BUFFER_BYTES.sub(bytes_allocated as i64);
376
377 if let Some(write_buffer_manager) = &self.write_buffer_manager {
379 write_buffer_manager.free_mem(bytes_allocated);
380 }
381 }
382}
383
384#[derive(Clone)]
386pub(crate) struct MemtableBuilderProvider {
387 write_buffer_manager: Option<WriteBufferManagerRef>,
388 config: Arc<MitoConfig>,
389 compact_dispatcher: Arc<CompactDispatcher>,
390}
391
392impl MemtableBuilderProvider {
393 pub(crate) fn new(
394 write_buffer_manager: Option<WriteBufferManagerRef>,
395 config: Arc<MitoConfig>,
396 ) -> Self {
397 let compact_dispatcher =
398 Arc::new(CompactDispatcher::new(config.max_background_compactions));
399
400 Self {
401 write_buffer_manager,
402 config,
403 compact_dispatcher,
404 }
405 }
406
407 pub(crate) fn builder_for_options(&self, options: &RegionOptions) -> MemtableBuilderRef {
408 let dedup = options.need_dedup();
409 let merge_mode = options.merge_mode();
410 let primary_key_encoding = options.primary_key_encoding();
411 let flat_format = options
412 .sst_format
413 .map(|format| format == FormatType::Flat)
414 .unwrap_or(self.config.default_flat_format);
415 if flat_format {
416 if options.memtable.is_some()
417 && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
418 {
419 common_telemetry::info!(
420 "Overriding memtable config, use BulkMemtable under flat format"
421 );
422 }
423
424 return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
425 }
426
427 if primary_key_encoding == PrimaryKeyEncoding::Sparse {
428 if options.memtable.is_some()
429 && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
430 {
431 common_telemetry::info!(
432 "Overriding memtable config, use BulkMemtable for sparse primary key encoding"
433 );
434 }
435 return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
436 }
437
438 match &options.memtable {
440 Some(MemtableOptions::Bulk(config)) => Arc::new(
441 BulkMemtableBuilder::new(self.write_buffer_manager.clone(), !dedup, merge_mode)
442 .with_config(config.clone())
443 .with_compact_dispatcher(self.compact_dispatcher.clone()),
444 ),
445 Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new(
446 self.write_buffer_manager.clone(),
447 dedup,
448 merge_mode,
449 )),
450 None => self.default_primary_key_memtable_builder(dedup, merge_mode),
451 }
452 }
453
454 fn bulk_memtable_builder(
455 &self,
456 dedup: bool,
457 merge_mode: MergeMode,
458 options: &RegionOptions,
459 ) -> BulkMemtableBuilder {
460 let mut builder = BulkMemtableBuilder::new(
461 self.write_buffer_manager.clone(),
462 !dedup, merge_mode,
464 )
465 .with_compact_dispatcher(self.compact_dispatcher.clone());
466
467 if let Some(MemtableOptions::Bulk(config)) = &options.memtable {
468 builder = builder.with_config(config.clone());
469 }
470
471 builder
472 }
473
474 fn default_primary_key_memtable_builder(
475 &self,
476 dedup: bool,
477 merge_mode: MergeMode,
478 ) -> MemtableBuilderRef {
479 Arc::new(TimeSeriesMemtableBuilder::new(
480 self.write_buffer_manager.clone(),
481 dedup,
482 merge_mode,
483 ))
484 }
485}
486
/// Shared handle to the metrics of a memtable scan.
///
/// Clones refer to the same underlying [`MemScanMetricsData`].
#[derive(Clone, Default)]
pub struct MemScanMetrics(Arc<Mutex<MemScanMetricsData>>);

impl MemScanMetrics {
    /// Adds every counter and duration of `inner` to the shared metrics.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub(crate) fn merge_inner(&self, inner: &MemScanMetricsData) {
        // Destructuring makes the compiler flag any field that is not merged.
        let MemScanMetricsData {
            total_series,
            num_rows,
            num_batches,
            scan_cost,
            prefilter_cost,
            prefilter_rows_filtered,
        } = inner;

        let mut shared = self.0.lock().unwrap();
        shared.total_series += *total_series;
        shared.num_rows += *num_rows;
        shared.num_batches += *num_batches;
        shared.scan_cost += *scan_cost;
        shared.prefilter_cost += *prefilter_cost;
        shared.prefilter_rows_filtered += *prefilter_rows_filtered;
    }

    /// Returns a copy of the current metrics.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub(crate) fn data(&self) -> MemScanMetricsData {
        let shared = self.0.lock().unwrap();
        (*shared).clone()
    }
}

/// Plain metric values collected while scanning a memtable.
#[derive(Clone, Default)]
pub(crate) struct MemScanMetricsData {
    /// Number of series.
    pub(crate) total_series: usize,
    /// Number of rows.
    pub(crate) num_rows: usize,
    /// Number of batches.
    pub(crate) num_batches: usize,
    /// Time spent scanning.
    pub(crate) scan_cost: Duration,
    /// Time spent pre-filtering.
    pub(crate) prefilter_cost: Duration,
    /// Number of rows removed by pre-filtering.
    pub(crate) prefilter_rows_filtered: usize,
}
524
525pub struct EncodedRange {
527 pub data: Bytes,
529 pub sst_info: SstInfo,
531}
532
533pub trait IterBuilder: Send + Sync {
536 fn build(&self, metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator>;
538
539 fn is_record_batch(&self) -> bool {
541 false
542 }
543
544 fn build_record_batch(
548 &self,
549 time_range: Option<(Timestamp, Timestamp)>,
550 metrics: Option<MemScanMetrics>,
551 ) -> Result<BoxedRecordBatchIterator> {
552 let _metrics = metrics;
553 let _ = time_range;
554 UnsupportedOperationSnafu {
555 err_msg: "Record batch iterator is not supported by this memtable",
556 }
557 .fail()
558 }
559
560 fn encoded_range(&self) -> Option<EncodedRange> {
562 None
563 }
564}
565
566pub type BoxedIterBuilder = Box<dyn IterBuilder>;
567
568pub fn read_column_ids_from_projection(
573 metadata: &RegionMetadataRef,
574 projection: Option<&[ColumnId]>,
575) -> Vec<ColumnId> {
576 if let Some(projection) = projection {
577 projection.to_vec()
578 } else {
579 metadata
580 .column_metadatas
581 .iter()
582 .map(|c| c.column_id)
583 .collect()
584 }
585}
586
587pub struct BatchToRecordBatchContext {
589 metadata: RegionMetadataRef,
590 codec: Arc<dyn PrimaryKeyCodec>,
591 read_column_ids: Vec<ColumnId>,
592}
593
594impl BatchToRecordBatchContext {
595 pub fn new(metadata: RegionMetadataRef, mut read_column_ids: Vec<ColumnId>) -> Self {
597 if read_column_ids.is_empty() {
598 read_column_ids.push(metadata.time_index_column().column_id);
599 }
600
601 let codec = build_primary_key_codec(&metadata);
602 Self {
603 metadata,
604 codec,
605 read_column_ids,
606 }
607 }
608
609 fn adapt_iter(&self, iter: BoxedBatchIterator) -> BoxedRecordBatchIterator {
610 Box::new(BatchToRecordBatchAdapter::new(
611 iter,
612 self.metadata.clone(),
613 self.codec.clone(),
614 &self.read_column_ids,
615 ))
616 }
617}
618
619pub struct MemtableRangeContext {
621 id: MemtableId,
623 builder: BoxedIterBuilder,
625 predicate: PredicateGroup,
627 batch_to_record_batch: Option<Arc<BatchToRecordBatchContext>>,
629}
630
631pub type MemtableRangeContextRef = Arc<MemtableRangeContext>;
632
633impl MemtableRangeContext {
634 pub fn new(id: MemtableId, builder: BoxedIterBuilder, predicate: PredicateGroup) -> Self {
636 Self::new_with_batch_to_record_batch(id, builder, predicate, None)
637 }
638
639 pub fn new_with_batch_to_record_batch(
641 id: MemtableId,
642 builder: BoxedIterBuilder,
643 predicate: PredicateGroup,
644 batch_to_record_batch: Option<Arc<BatchToRecordBatchContext>>,
645 ) -> Self {
646 Self {
647 id,
648 builder,
649 predicate,
650 batch_to_record_batch,
651 }
652 }
653}
654
655#[derive(Clone)]
657pub struct MemtableRange {
658 context: MemtableRangeContextRef,
660 stats: MemtableStats,
662}
663
664impl MemtableRange {
665 pub fn new(context: MemtableRangeContextRef, stats: MemtableStats) -> Self {
667 Self { context, stats }
668 }
669
670 pub fn stats(&self) -> &MemtableStats {
672 &self.stats
673 }
674
675 pub fn id(&self) -> MemtableId {
677 self.context.id
678 }
679
680 pub fn build_prune_iter(
684 &self,
685 time_range: FileTimeRange,
686 metrics: Option<MemScanMetrics>,
687 ) -> Result<BoxedBatchIterator> {
688 let iter = self.context.builder.build(metrics)?;
689 let time_filters = self.context.predicate.time_filters();
690 Ok(Box::new(PruneTimeIterator::new(
691 iter,
692 time_range,
693 time_filters,
694 )))
695 }
696
697 pub fn build_iter(&self) -> Result<BoxedBatchIterator> {
699 self.context.builder.build(None)
700 }
701
702 pub fn build_record_batch_iter(
707 &self,
708 time_range: Option<FileTimeRange>,
709 metrics: Option<MemScanMetrics>,
710 ) -> Result<BoxedRecordBatchIterator> {
711 if self.context.builder.is_record_batch() {
712 return self.context.builder.build_record_batch(time_range, metrics);
713 }
714
715 if let Some(context) = self.context.batch_to_record_batch.as_ref() {
716 let iter = self.context.builder.build(metrics)?;
717 let iter: BoxedBatchIterator = if let Some(time_range) = time_range {
718 let time_filters = self.context.predicate.time_filters();
719 Box::new(PruneTimeIterator::new(iter, time_range, time_filters))
720 } else {
721 iter
722 };
723 return Ok(context.adapt_iter(iter));
724 }
725
726 UnsupportedOperationSnafu {
727 err_msg: "Record batch iterator is not supported by this memtable",
728 }
729 .fail()
730 }
731
732 pub fn is_record_batch(&self) -> bool {
734 self.context.builder.is_record_batch()
735 }
736
737 pub fn num_rows(&self) -> usize {
738 self.stats.num_rows
739 }
740
741 pub fn encoded(&self) -> Option<EncodedRange> {
743 self.context.builder.encoded_range()
744 }
745}
746
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
    use crate::memtable::bulk::BulkMemtableConfig;

    /// Without a manager the tracker only counts bytes.
    #[test]
    fn test_alloc_tracker_without_manager() {
        let tracker = AllocTracker::new(None);
        assert_eq!(0, tracker.bytes_allocated());

        tracker.on_allocation(100);
        assert_eq!(100, tracker.bytes_allocated());

        tracker.on_allocation(200);
        assert_eq!(300, tracker.bytes_allocated());

        // Finishing allocation does not reset the counter.
        tracker.done_allocating();
        assert_eq!(300, tracker.bytes_allocated());
    }

    /// With a manager, allocations are reserved, scheduled to be freed once and
    /// released when the tracker is dropped.
    #[test]
    fn test_alloc_tracker_with_manager() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());

            // A second call to `done_allocating` must not change the usage again.
            for _ in 0..2 {
                tracker.done_allocating();
                assert_eq!(100, manager.memory_usage());
                assert_eq!(0, manager.mutable_usage());
            }
        }

        // Dropping the tracker returns all memory to the manager.
        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }

    /// Dropping a tracker that never called `done_allocating` still releases
    /// everything.
    #[test]
    fn test_alloc_tracker_without_done_allocating() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());
        }

        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }

    /// The bulk builder keeps a user-provided bulk config.
    #[test]
    fn test_forced_bulk_memtable_preserves_bulk_config() {
        let provider = MemtableBuilderProvider::new(None, Arc::new(MitoConfig::default()));
        let config = BulkMemtableConfig {
            merge_threshold: 7,
            encode_row_threshold: 11,
            encode_bytes_threshold: 13,
            max_merge_groups: 17,
        };
        let options = RegionOptions {
            memtable: Some(MemtableOptions::Bulk(config.clone())),
            primary_key_encoding: Some(PrimaryKeyEncoding::Sparse),
            ..Default::default()
        };

        let dedup = options.need_dedup();
        let merge_mode = options.merge_mode();
        let builder = provider.bulk_memtable_builder(dedup, merge_mode, &options);

        assert_eq!(&config, builder.config());
    }
}