1pub mod batch_adapter;
18pub mod compat;
19pub mod dedup;
20pub mod flat_dedup;
21pub mod flat_merge;
22pub mod flat_projection;
23pub mod last_row;
24pub mod projection;
25pub(crate) mod prune;
26pub(crate) mod pruner;
27pub mod range;
28#[cfg(feature = "test")]
29pub mod range_cache;
30#[cfg(not(feature = "test"))]
31pub(crate) mod range_cache;
32pub mod scan_region;
33pub mod scan_util;
34pub(crate) mod seq_scan;
35pub mod series_scan;
36pub mod stream;
37pub(crate) mod unordered_scan;
38
39use std::collections::HashMap;
40use std::sync::Arc;
41use std::time::Duration;
42
43use api::v1::OpType;
44use async_trait::async_trait;
45use common_time::Timestamp;
46use datafusion_common::arrow::array::UInt8Array;
47use datatypes::arrow;
48use datatypes::arrow::array::{Array, ArrayRef};
49use datatypes::arrow::compute::SortOptions;
50use datatypes::arrow::record_batch::RecordBatch;
51use datatypes::arrow::row::{RowConverter, SortField};
52use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
53use datatypes::scalars::ScalarVectorBuilder;
54use datatypes::types::TimestampType;
55use datatypes::value::{Value, ValueRef};
56use datatypes::vectors::{
57 BooleanVector, Helper, TimestampMicrosecondVector, TimestampMillisecondVector,
58 TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampSecondVector,
59 UInt8Vector, UInt8VectorBuilder, UInt32Vector, UInt64Vector, UInt64VectorBuilder, Vector,
60 VectorRef,
61};
62use futures::TryStreamExt;
63use futures::stream::BoxStream;
64use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
65use snafu::{OptionExt, ResultExt, ensure};
66use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
67
68use crate::error::{
69 ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu,
70 Result,
71};
72use crate::memtable::{BoxedBatchIterator, BoxedRecordBatchIterator};
73#[derive(Debug, PartialEq, Clone)]
78pub struct Batch {
79 primary_key: Vec<u8>,
81 pk_values: Option<CompositeValues>,
83 timestamps: VectorRef,
85 sequences: Arc<UInt64Vector>,
89 op_types: Arc<UInt8Vector>,
93 fields: Vec<BatchColumn>,
95 fields_idx: Option<HashMap<ColumnId, usize>>,
97}
98
99impl Batch {
100 pub fn new(
102 primary_key: Vec<u8>,
103 timestamps: VectorRef,
104 sequences: Arc<UInt64Vector>,
105 op_types: Arc<UInt8Vector>,
106 fields: Vec<BatchColumn>,
107 ) -> Result<Batch> {
108 BatchBuilder::with_required_columns(primary_key, timestamps, sequences, op_types)
109 .with_fields(fields)
110 .build()
111 }
112
113 pub fn with_fields(self, fields: Vec<BatchColumn>) -> Result<Batch> {
115 Batch::new(
116 self.primary_key,
117 self.timestamps,
118 self.sequences,
119 self.op_types,
120 fields,
121 )
122 }
123
124 pub fn primary_key(&self) -> &[u8] {
126 &self.primary_key
127 }
128
129 pub fn pk_values(&self) -> Option<&CompositeValues> {
131 self.pk_values.as_ref()
132 }
133
134 pub fn set_pk_values(&mut self, pk_values: CompositeValues) {
136 self.pk_values = Some(pk_values);
137 }
138
/// Clears the decoded primary key values. Only compiled for tests.
#[cfg(any(test, feature = "test"))]
pub fn remove_pk_values(&mut self) {
    drop(self.pk_values.take());
}
144
145 pub fn fields(&self) -> &[BatchColumn] {
147 &self.fields
148 }
149
150 pub fn timestamps(&self) -> &VectorRef {
152 &self.timestamps
153 }
154
155 pub fn sequences(&self) -> &Arc<UInt64Vector> {
157 &self.sequences
158 }
159
160 pub fn op_types(&self) -> &Arc<UInt8Vector> {
162 &self.op_types
163 }
164
165 pub fn num_rows(&self) -> usize {
167 self.sequences.len()
170 }
171
172 #[allow(dead_code)]
174 pub(crate) fn empty() -> Self {
175 Self {
176 primary_key: vec![],
177 pk_values: None,
178 timestamps: Arc::new(TimestampMillisecondVectorBuilder::with_capacity(0).finish()),
179 sequences: Arc::new(UInt64VectorBuilder::with_capacity(0).finish()),
180 op_types: Arc::new(UInt8VectorBuilder::with_capacity(0).finish()),
181 fields: vec![],
182 fields_idx: None,
183 }
184 }
185
186 pub fn is_empty(&self) -> bool {
188 self.num_rows() == 0
189 }
190
191 pub fn first_timestamp(&self) -> Option<Timestamp> {
193 if self.timestamps.is_empty() {
194 return None;
195 }
196
197 Some(self.get_timestamp(0))
198 }
199
200 pub fn last_timestamp(&self) -> Option<Timestamp> {
202 if self.timestamps.is_empty() {
203 return None;
204 }
205
206 Some(self.get_timestamp(self.timestamps.len() - 1))
207 }
208
209 pub fn first_sequence(&self) -> Option<SequenceNumber> {
211 if self.sequences.is_empty() {
212 return None;
213 }
214
215 Some(self.get_sequence(0))
216 }
217
218 pub fn last_sequence(&self) -> Option<SequenceNumber> {
220 if self.sequences.is_empty() {
221 return None;
222 }
223
224 Some(self.get_sequence(self.sequences.len() - 1))
225 }
226
227 pub fn set_primary_key(&mut self, primary_key: Vec<u8>) {
232 self.primary_key = primary_key;
233 }
234
235 pub fn slice(&self, offset: usize, length: usize) -> Batch {
240 let fields = self
241 .fields
242 .iter()
243 .map(|column| BatchColumn {
244 column_id: column.column_id,
245 data: column.data.slice(offset, length),
246 })
247 .collect();
248 Batch {
250 primary_key: self.primary_key.clone(),
253 pk_values: self.pk_values.clone(),
254 timestamps: self.timestamps.slice(offset, length),
255 sequences: Arc::new(self.sequences.get_slice(offset, length)),
256 op_types: Arc::new(self.op_types.get_slice(offset, length)),
257 fields,
258 fields_idx: self.fields_idx.clone(),
259 }
260 }
261
262 pub fn concat(mut batches: Vec<Batch>) -> Result<Batch> {
266 ensure!(
267 !batches.is_empty(),
268 InvalidBatchSnafu {
269 reason: "empty batches",
270 }
271 );
272 if batches.len() == 1 {
273 return Ok(batches.pop().unwrap());
275 }
276
277 let primary_key = std::mem::take(&mut batches[0].primary_key);
278 let first = &batches[0];
279 ensure!(
281 batches
282 .iter()
283 .skip(1)
284 .all(|b| b.primary_key() == primary_key),
285 InvalidBatchSnafu {
286 reason: "batches have different primary key",
287 }
288 );
289 for b in batches.iter().skip(1) {
290 ensure!(
291 b.fields.len() == first.fields.len(),
292 InvalidBatchSnafu {
293 reason: "batches have different field num",
294 }
295 );
296 for (l, r) in b.fields.iter().zip(&first.fields) {
297 ensure!(
298 l.column_id == r.column_id,
299 InvalidBatchSnafu {
300 reason: "batches have different fields",
301 }
302 );
303 }
304 }
305
306 let mut builder = BatchBuilder::new(primary_key);
308 let array = concat_arrays(batches.iter().map(|b| b.timestamps().to_arrow_array()))?;
310 builder.timestamps_array(array)?;
311 let array = concat_arrays(batches.iter().map(|b| b.sequences().to_arrow_array()))?;
312 builder.sequences_array(array)?;
313 let array = concat_arrays(batches.iter().map(|b| b.op_types().to_arrow_array()))?;
314 builder.op_types_array(array)?;
315 for (i, batch_column) in first.fields.iter().enumerate() {
316 let array = concat_arrays(batches.iter().map(|b| b.fields()[i].data.to_arrow_array()))?;
317 builder.push_field_array(batch_column.column_id, array)?;
318 }
319
320 builder.build()
321 }
322
323 pub fn filter_deleted(&mut self) -> Result<()> {
325 let array = self.op_types.as_arrow();
327 let rhs = UInt8Array::new_scalar(OpType::Delete as u8);
329 let predicate =
330 arrow::compute::kernels::cmp::neq(array, &rhs).context(ComputeArrowSnafu)?;
331 self.filter(&BooleanVector::from(predicate))
332 }
333
334 pub fn filter(&mut self, predicate: &BooleanVector) -> Result<()> {
337 self.timestamps = self
338 .timestamps
339 .filter(predicate)
340 .context(ComputeVectorSnafu)?;
341 self.sequences = Arc::new(
342 UInt64Vector::try_from_arrow_array(
343 arrow::compute::filter(self.sequences.as_arrow(), predicate.as_boolean_array())
344 .context(ComputeArrowSnafu)?,
345 )
346 .unwrap(),
347 );
348 self.op_types = Arc::new(
349 UInt8Vector::try_from_arrow_array(
350 arrow::compute::filter(self.op_types.as_arrow(), predicate.as_boolean_array())
351 .context(ComputeArrowSnafu)?,
352 )
353 .unwrap(),
354 );
355 for batch_column in &mut self.fields {
356 batch_column.data = batch_column
357 .data
358 .filter(predicate)
359 .context(ComputeVectorSnafu)?;
360 }
361
362 Ok(())
363 }
364
365 pub fn filter_by_sequence(&mut self, sequence: Option<SequenceRange>) -> Result<()> {
367 let seq_range = match sequence {
368 None => return Ok(()),
369 Some(seq_range) => {
370 let (Some(first), Some(last)) = (self.first_sequence(), self.last_sequence())
371 else {
372 return Ok(());
373 };
374 let is_subset = match seq_range {
375 SequenceRange::Gt { min } => min < first,
376 SequenceRange::LtEq { max } => max >= last,
377 SequenceRange::GtLtEq { min, max } => min < first && max >= last,
378 };
379 if is_subset {
380 return Ok(());
381 }
382 seq_range
383 }
384 };
385
386 let seqs = self.sequences.as_arrow();
387 let predicate = seq_range.filter(seqs).context(ComputeArrowSnafu)?;
388
389 let predicate = BooleanVector::from(predicate);
390 self.filter(&predicate)?;
391
392 Ok(())
393 }
394
395 pub fn sort(&mut self, dedup: bool) -> Result<()> {
402 let converter = RowConverter::new(vec![
405 SortField::new(self.timestamps.data_type().as_arrow_type()),
406 SortField::new_with_options(
407 self.sequences.data_type().as_arrow_type(),
408 SortOptions {
409 descending: true,
410 ..Default::default()
411 },
412 ),
413 ])
414 .context(ComputeArrowSnafu)?;
415 let columns = [
417 self.timestamps.to_arrow_array(),
418 self.sequences.to_arrow_array(),
419 ];
420 let rows = converter.convert_columns(&columns).unwrap();
421 let mut to_sort: Vec<_> = rows.iter().enumerate().collect();
422
423 let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
424 if !was_sorted {
425 to_sort.sort_unstable_by_key(|x| x.1);
426 }
427
428 let num_rows = to_sort.len();
429 if dedup {
430 to_sort.dedup_by(|left, right| {
432 debug_assert_eq!(18, left.1.as_ref().len());
433 debug_assert_eq!(18, right.1.as_ref().len());
434 let (left_key, right_key) = (left.1.as_ref(), right.1.as_ref());
435 left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
437 });
438 }
439 let no_dedup = to_sort.len() == num_rows;
440
441 if was_sorted && no_dedup {
442 return Ok(());
443 }
444 let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
445 self.take_in_place(&indices)
446 }
447
448 pub(crate) fn merge_last_non_null(&mut self) -> Result<()> {
456 let num_rows = self.num_rows();
457 if num_rows < 2 {
458 return Ok(());
459 }
460
461 let Some(timestamps) = self.timestamps_native() else {
462 return Ok(());
463 };
464
465 let mut has_dup = false;
467 let mut group_count = 1;
468 for i in 1..num_rows {
469 has_dup |= timestamps[i] == timestamps[i - 1];
470 group_count += (timestamps[i] != timestamps[i - 1]) as usize;
471 }
472 if !has_dup {
473 return Ok(());
474 }
475
476 let num_fields = self.fields.len();
477 let op_types = self.op_types.as_arrow().values();
478
479 let mut base_indices: Vec<u32> = Vec::with_capacity(group_count);
480 let mut field_indices: Vec<Vec<u32>> = (0..num_fields)
481 .map(|_| Vec::with_capacity(group_count))
482 .collect();
483
484 let mut start = 0;
485 while start < num_rows {
486 let ts = timestamps[start];
487 let mut end = start + 1;
488 while end < num_rows && timestamps[end] == ts {
489 end += 1;
490 }
491
492 let group_pos = base_indices.len();
493 base_indices.push(start as u32);
494
495 if num_fields > 0 {
496 for idx in &mut field_indices {
498 idx.push(start as u32);
499 }
500
501 let base_deleted = op_types[start] == OpType::Delete as u8;
502 if !base_deleted {
503 let mut missing_fields = Vec::new();
506 for (field_idx, col) in self.fields.iter().enumerate() {
507 if col.data.is_null(start) {
508 missing_fields.push(field_idx);
509 }
510 }
511
512 if !missing_fields.is_empty() {
513 for row_idx in (start + 1)..end {
514 if op_types[row_idx] == OpType::Delete as u8 {
515 break;
516 }
517
518 missing_fields.retain(|&field_idx| {
519 if self.fields[field_idx].data.is_null(row_idx) {
520 true
521 } else {
522 field_indices[field_idx][group_pos] = row_idx as u32;
523 false
524 }
525 });
526
527 if missing_fields.is_empty() {
528 break;
529 }
530 }
531 }
532 }
533 }
534
535 start = end;
536 }
537
538 let base_indices = UInt32Vector::from_vec(base_indices);
539 self.timestamps = self
540 .timestamps
541 .take(&base_indices)
542 .context(ComputeVectorSnafu)?;
543 let array = arrow::compute::take(self.sequences.as_arrow(), base_indices.as_arrow(), None)
544 .context(ComputeArrowSnafu)?;
545 self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
547 let array = arrow::compute::take(self.op_types.as_arrow(), base_indices.as_arrow(), None)
548 .context(ComputeArrowSnafu)?;
549 self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
551
552 for (field_idx, batch_column) in self.fields.iter_mut().enumerate() {
553 let idx = UInt32Vector::from_vec(std::mem::take(&mut field_indices[field_idx]));
554 batch_column.data = batch_column.data.take(&idx).context(ComputeVectorSnafu)?;
555 }
556
557 Ok(())
558 }
559
560 pub fn memory_size(&self) -> usize {
562 let mut size = std::mem::size_of::<Self>();
563 size += self.primary_key.len();
564 size += self.timestamps.memory_size();
565 size += self.sequences.memory_size();
566 size += self.op_types.memory_size();
567 for batch_column in &self.fields {
568 size += batch_column.data.memory_size();
569 }
570 size
571 }
572
573 pub(crate) fn timestamps_native(&self) -> Option<&[i64]> {
575 if self.timestamps.is_empty() {
576 return None;
577 }
578
579 let values = match self.timestamps.data_type() {
580 ConcreteDataType::Timestamp(TimestampType::Second(_)) => self
581 .timestamps
582 .as_any()
583 .downcast_ref::<TimestampSecondVector>()
584 .unwrap()
585 .as_arrow()
586 .values(),
587 ConcreteDataType::Timestamp(TimestampType::Millisecond(_)) => self
588 .timestamps
589 .as_any()
590 .downcast_ref::<TimestampMillisecondVector>()
591 .unwrap()
592 .as_arrow()
593 .values(),
594 ConcreteDataType::Timestamp(TimestampType::Microsecond(_)) => self
595 .timestamps
596 .as_any()
597 .downcast_ref::<TimestampMicrosecondVector>()
598 .unwrap()
599 .as_arrow()
600 .values(),
601 ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)) => self
602 .timestamps
603 .as_any()
604 .downcast_ref::<TimestampNanosecondVector>()
605 .unwrap()
606 .as_arrow()
607 .values(),
608 other => panic!("timestamps in a Batch has other type {:?}", other),
609 };
610
611 Some(values)
612 }
613
614 fn take_in_place(&mut self, indices: &UInt32Vector) -> Result<()> {
616 self.timestamps = self.timestamps.take(indices).context(ComputeVectorSnafu)?;
617 let array = arrow::compute::take(self.sequences.as_arrow(), indices.as_arrow(), None)
618 .context(ComputeArrowSnafu)?;
619 self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
621 let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
622 .context(ComputeArrowSnafu)?;
623 self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
624 for batch_column in &mut self.fields {
625 batch_column.data = batch_column
626 .data
627 .take(indices)
628 .context(ComputeVectorSnafu)?;
629 }
630
631 Ok(())
632 }
633
634 fn get_timestamp(&self, index: usize) -> Timestamp {
639 match self.timestamps.get_ref(index) {
640 ValueRef::Timestamp(timestamp) => timestamp,
641
642 value => panic!("{:?} is not a timestamp", value),
644 }
645 }
646
647 pub(crate) fn get_sequence(&self, index: usize) -> SequenceNumber {
652 self.sequences.get_data(index).unwrap()
654 }
655
656 #[cfg(debug_assertions)]
658 #[allow(dead_code)]
659 pub(crate) fn check_monotonic(&self) -> Result<(), String> {
660 use std::cmp::Ordering;
661 if self.timestamps_native().is_none() {
662 return Ok(());
663 }
664
665 let timestamps = self.timestamps_native().unwrap();
666 let sequences = self.sequences.as_arrow().values();
667 for (i, window) in timestamps.windows(2).enumerate() {
668 let current = window[0];
669 let next = window[1];
670 let current_sequence = sequences[i];
671 let next_sequence = sequences[i + 1];
672 match current.cmp(&next) {
673 Ordering::Less => {
674 continue;
676 }
677 Ordering::Equal => {
678 if current_sequence < next_sequence {
680 return Err(format!(
681 "sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
682 current, next, current_sequence, next_sequence, i
683 ));
684 }
685 }
686 Ordering::Greater => {
687 return Err(format!(
689 "timestamps are not monotonic: {} > {}, index: {}",
690 current, next, i
691 ));
692 }
693 }
694 }
695
696 Ok(())
697 }
698
699 #[cfg(debug_assertions)]
701 #[allow(dead_code)]
702 pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
703 if self.primary_key() < other.primary_key() {
705 return Ok(());
706 }
707 if self.primary_key() > other.primary_key() {
708 return Err(format!(
709 "primary key is not monotonic: {:?} > {:?}",
710 self.primary_key(),
711 other.primary_key()
712 ));
713 }
714 if self.last_timestamp() < other.first_timestamp() {
716 return Ok(());
717 }
718 if self.last_timestamp() > other.first_timestamp() {
719 return Err(format!(
720 "timestamps are not monotonic: {:?} > {:?}",
721 self.last_timestamp(),
722 other.first_timestamp()
723 ));
724 }
725 if self.last_sequence() >= other.first_sequence() {
727 return Ok(());
728 }
729 Err(format!(
730 "sequences are not monotonic: {:?} < {:?}",
731 self.last_sequence(),
732 other.first_sequence()
733 ))
734 }
735
736 pub fn pk_col_value(
740 &mut self,
741 codec: &dyn PrimaryKeyCodec,
742 col_idx_in_pk: usize,
743 column_id: ColumnId,
744 ) -> Result<Option<&Value>> {
745 if self.pk_values.is_none() {
746 self.pk_values = Some(codec.decode(&self.primary_key).context(DecodeSnafu)?);
747 }
748
749 let pk_values = self.pk_values.as_ref().unwrap();
750 Ok(match pk_values {
751 CompositeValues::Dense(values) => values.get(col_idx_in_pk).map(|(_, v)| v),
752 CompositeValues::Sparse(values) => values.get(&column_id),
753 })
754 }
755
756 pub fn field_col_value(&mut self, column_id: ColumnId) -> Option<&BatchColumn> {
760 if self.fields_idx.is_none() {
761 self.fields_idx = Some(
762 self.fields
763 .iter()
764 .enumerate()
765 .map(|(i, c)| (c.column_id, i))
766 .collect(),
767 );
768 }
769
770 self.fields_idx
771 .as_ref()
772 .unwrap()
773 .get(&column_id)
774 .map(|&idx| &self.fields[idx])
775 }
776}
777
778#[cfg(debug_assertions)]
780#[derive(Default)]
781#[allow(dead_code)]
782pub(crate) struct BatchChecker {
783 last_batch: Option<Batch>,
784 start: Option<Timestamp>,
785 end: Option<Timestamp>,
786}
787
788#[cfg(debug_assertions)]
789#[allow(dead_code)]
790impl BatchChecker {
791 pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
793 self.start = start;
794 self
795 }
796
797 pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
799 self.end = end;
800 self
801 }
802
803 pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
806 batch.check_monotonic()?;
807
808 if let (Some(start), Some(first)) = (self.start, batch.first_timestamp())
809 && start > first
810 {
811 return Err(format!(
812 "batch's first timestamp is before the start timestamp: {:?} > {:?}",
813 start, first
814 ));
815 }
816 if let (Some(end), Some(last)) = (self.end, batch.last_timestamp())
817 && end <= last
818 {
819 return Err(format!(
820 "batch's last timestamp is after the end timestamp: {:?} <= {:?}",
821 end, last
822 ));
823 }
824
825 let res = self
828 .last_batch
829 .as_ref()
830 .map(|last| last.check_next_batch(batch))
831 .unwrap_or(Ok(()));
832 self.last_batch = Some(batch.clone());
833 res
834 }
835
836 pub(crate) fn format_batch(&self, batch: &Batch) -> String {
838 use std::fmt::Write;
839
840 let mut message = String::new();
841 if let Some(last) = &self.last_batch {
842 write!(
843 message,
844 "last_pk: {:?}, last_ts: {:?}, last_seq: {:?}, ",
845 last.primary_key(),
846 last.last_timestamp(),
847 last.last_sequence()
848 )
849 .unwrap();
850 }
851 write!(
852 message,
853 "batch_pk: {:?}, batch_ts: {:?}, batch_seq: {:?}",
854 batch.primary_key(),
855 batch.timestamps(),
856 batch.sequences()
857 )
858 .unwrap();
859
860 message
861 }
862
863 pub(crate) fn ensure_part_range_batch(
865 &mut self,
866 scanner: &str,
867 region_id: store_api::storage::RegionId,
868 partition: usize,
869 part_range: store_api::region_engine::PartitionRange,
870 batch: &Batch,
871 ) {
872 if let Err(e) = self.check_monotonic(batch) {
873 let err_msg = format!(
874 "{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
875 scanner, e, region_id, partition, part_range,
876 );
877 common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
878 panic!("{err_msg}, batch rows: {}", batch.num_rows());
880 }
881 }
882}
883
/// Length in bytes of the timestamp key at the front of each row compared in
/// `Batch::sort`.
///
/// Presumably one validity byte followed by the 8-byte timestamp value, as laid
/// out by the arrow row format.
const TIMESTAMP_KEY_LEN: usize = 1 + std::mem::size_of::<i64>();
886
887fn concat_arrays(iter: impl Iterator<Item = ArrayRef>) -> Result<ArrayRef> {
889 let arrays: Vec<_> = iter.collect();
890 let dyn_arrays: Vec<_> = arrays.iter().map(|array| array.as_ref()).collect();
891 arrow::compute::concat(&dyn_arrays).context(ComputeArrowSnafu)
892}
893
894#[derive(Debug, PartialEq, Eq, Clone)]
896pub struct BatchColumn {
897 pub column_id: ColumnId,
899 pub data: VectorRef,
901}
902
903pub struct BatchBuilder {
905 primary_key: Vec<u8>,
906 timestamps: Option<VectorRef>,
907 sequences: Option<Arc<UInt64Vector>>,
908 op_types: Option<Arc<UInt8Vector>>,
909 fields: Vec<BatchColumn>,
910}
911
912impl BatchBuilder {
913 pub fn new(primary_key: Vec<u8>) -> BatchBuilder {
915 BatchBuilder {
916 primary_key,
917 timestamps: None,
918 sequences: None,
919 op_types: None,
920 fields: Vec::new(),
921 }
922 }
923
924 pub fn with_required_columns(
926 primary_key: Vec<u8>,
927 timestamps: VectorRef,
928 sequences: Arc<UInt64Vector>,
929 op_types: Arc<UInt8Vector>,
930 ) -> BatchBuilder {
931 BatchBuilder {
932 primary_key,
933 timestamps: Some(timestamps),
934 sequences: Some(sequences),
935 op_types: Some(op_types),
936 fields: Vec::new(),
937 }
938 }
939
940 pub fn with_fields(mut self, fields: Vec<BatchColumn>) -> Self {
942 self.fields = fields;
943 self
944 }
945
946 pub fn push_field(&mut self, column: BatchColumn) -> &mut Self {
948 self.fields.push(column);
949 self
950 }
951
952 pub fn push_field_array(&mut self, column_id: ColumnId, array: ArrayRef) -> Result<&mut Self> {
954 let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
955 self.fields.push(BatchColumn {
956 column_id,
957 data: vector,
958 });
959
960 Ok(self)
961 }
962
963 pub fn timestamps_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
965 let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
966 ensure!(
967 vector.data_type().is_timestamp(),
968 InvalidBatchSnafu {
969 reason: format!("{:?} is not a timestamp type", vector.data_type()),
970 }
971 );
972
973 self.timestamps = Some(vector);
974 Ok(self)
975 }
976
977 pub fn sequences_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
979 ensure!(
980 *array.data_type() == arrow::datatypes::DataType::UInt64,
981 InvalidBatchSnafu {
982 reason: "sequence array is not UInt64 type",
983 }
984 );
985 let vector = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
987 self.sequences = Some(vector);
988
989 Ok(self)
990 }
991
992 pub fn op_types_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
994 ensure!(
995 *array.data_type() == arrow::datatypes::DataType::UInt8,
996 InvalidBatchSnafu {
997 reason: "sequence array is not UInt8 type",
998 }
999 );
1000 let vector = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
1002 self.op_types = Some(vector);
1003
1004 Ok(self)
1005 }
1006
1007 pub fn build(self) -> Result<Batch> {
1009 let timestamps = self.timestamps.context(InvalidBatchSnafu {
1010 reason: "missing timestamps",
1011 })?;
1012 let sequences = self.sequences.context(InvalidBatchSnafu {
1013 reason: "missing sequences",
1014 })?;
1015 let op_types = self.op_types.context(InvalidBatchSnafu {
1016 reason: "missing op_types",
1017 })?;
1018 assert_eq!(0, timestamps.null_count());
1021 assert_eq!(0, sequences.null_count());
1022 assert_eq!(0, op_types.null_count());
1023
1024 let ts_len = timestamps.len();
1025 ensure!(
1026 sequences.len() == ts_len,
1027 InvalidBatchSnafu {
1028 reason: format!(
1029 "sequence have different len {} != {}",
1030 sequences.len(),
1031 ts_len
1032 ),
1033 }
1034 );
1035 ensure!(
1036 op_types.len() == ts_len,
1037 InvalidBatchSnafu {
1038 reason: format!(
1039 "op type have different len {} != {}",
1040 op_types.len(),
1041 ts_len
1042 ),
1043 }
1044 );
1045 for column in &self.fields {
1046 ensure!(
1047 column.data.len() == ts_len,
1048 InvalidBatchSnafu {
1049 reason: format!(
1050 "column {} has different len {} != {}",
1051 column.column_id,
1052 column.data.len(),
1053 ts_len
1054 ),
1055 }
1056 );
1057 }
1058
1059 Ok(Batch {
1060 primary_key: self.primary_key,
1061 pk_values: None,
1062 timestamps,
1063 sequences,
1064 op_types,
1065 fields: self.fields,
1066 fields_idx: None,
1067 })
1068 }
1069}
1070
1071impl From<Batch> for BatchBuilder {
1072 fn from(batch: Batch) -> Self {
1073 Self {
1074 primary_key: batch.primary_key,
1075 timestamps: Some(batch.timestamps),
1076 sequences: Some(batch.sequences),
1077 op_types: Some(batch.op_types),
1078 fields: batch.fields,
1079 }
1080 }
1081}
1082
1083pub enum Source {
1087 Reader(BoxedBatchReader),
1089 Iter(BoxedBatchIterator),
1091 Stream(BoxedBatchStream),
1093}
1094
1095impl Source {
1096 pub async fn next_batch(&mut self) -> Result<Option<Batch>> {
1098 match self {
1099 Source::Reader(reader) => reader.next_batch().await,
1100 Source::Iter(iter) => iter.next().transpose(),
1101 Source::Stream(stream) => stream.try_next().await,
1102 }
1103 }
1104}
1105
1106pub enum FlatSource {
1108 Iter(BoxedRecordBatchIterator),
1110 Stream(BoxedRecordBatchStream),
1112}
1113
1114impl FlatSource {
1115 pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
1117 match self {
1118 FlatSource::Iter(iter) => iter.next().transpose(),
1119 FlatSource::Stream(stream) => stream.try_next().await,
1120 }
1121 }
1122}
1123
1124#[async_trait]
1128pub trait BatchReader: Send {
1129 async fn next_batch(&mut self) -> Result<Option<Batch>>;
1137}
1138
1139pub type BoxedBatchReader = Box<dyn BatchReader>;
1141
1142pub type BoxedBatchStream = BoxStream<'static, Result<Batch>>;
1144
1145pub type BoxedRecordBatchStream = BoxStream<'static, Result<RecordBatch>>;
1147
1148#[async_trait::async_trait]
1149impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
1150 async fn next_batch(&mut self) -> Result<Option<Batch>> {
1151 (**self).next_batch().await
1152 }
1153}
1154
/// Counters and timings collected by a scanner.
///
/// NOTE(review): the fields are only declared here; the descriptions below are
/// read off their names and should be confirmed against the scanners.
#[derive(Default, Debug)]
pub(crate) struct ScannerMetrics {
    /// Presumably the time spent scanning.
    scan_cost: Duration,
    /// Presumably the time spent yielding results to the caller.
    yield_cost: Duration,
    /// Presumably the number of batches returned.
    num_batches: usize,
    /// Presumably the number of rows returned.
    num_rows: usize,
}
1167
1168#[cfg(test)]
1169mod tests {
1170 use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
1171 use mito_codec::row_converter::{self, build_primary_key_codec_with_fields};
1172 use store_api::codec::PrimaryKeyEncoding;
1173 use store_api::storage::consts::ReservedColumnId;
1174
1175 use super::*;
1176 use crate::error::Error;
1177 use crate::test_util::new_batch_builder;
1178
1179 fn new_batch(
1180 timestamps: &[i64],
1181 sequences: &[u64],
1182 op_types: &[OpType],
1183 field: &[u64],
1184 ) -> Batch {
1185 new_batch_builder(b"test", timestamps, sequences, op_types, 1, field)
1186 .build()
1187 .unwrap()
1188 }
1189
1190 fn new_batch_with_u64_fields(
1191 timestamps: &[i64],
1192 sequences: &[u64],
1193 op_types: &[OpType],
1194 fields: &[(ColumnId, &[Option<u64>])],
1195 ) -> Batch {
1196 assert_eq!(timestamps.len(), sequences.len());
1197 assert_eq!(timestamps.len(), op_types.len());
1198 for (_, values) in fields {
1199 assert_eq!(timestamps.len(), values.len());
1200 }
1201
1202 let mut builder = BatchBuilder::new(b"test".to_vec());
1203 builder
1204 .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1205 timestamps.iter().copied(),
1206 )))
1207 .unwrap()
1208 .sequences_array(Arc::new(UInt64Array::from_iter_values(
1209 sequences.iter().copied(),
1210 )))
1211 .unwrap()
1212 .op_types_array(Arc::new(UInt8Array::from_iter_values(
1213 op_types.iter().map(|v| *v as u8),
1214 )))
1215 .unwrap();
1216
1217 for (col_id, values) in fields {
1218 builder
1219 .push_field_array(*col_id, Arc::new(UInt64Array::from(values.to_vec())))
1220 .unwrap();
1221 }
1222
1223 builder.build().unwrap()
1224 }
1225
1226 fn new_batch_without_fields(
1227 timestamps: &[i64],
1228 sequences: &[u64],
1229 op_types: &[OpType],
1230 ) -> Batch {
1231 assert_eq!(timestamps.len(), sequences.len());
1232 assert_eq!(timestamps.len(), op_types.len());
1233
1234 let mut builder = BatchBuilder::new(b"test".to_vec());
1235 builder
1236 .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1237 timestamps.iter().copied(),
1238 )))
1239 .unwrap()
1240 .sequences_array(Arc::new(UInt64Array::from_iter_values(
1241 sequences.iter().copied(),
1242 )))
1243 .unwrap()
1244 .op_types_array(Arc::new(UInt8Array::from_iter_values(
1245 op_types.iter().map(|v| *v as u8),
1246 )))
1247 .unwrap();
1248
1249 builder.build().unwrap()
1250 }
1251
/// An empty batch reports no rows, no first/last values and no native timestamps.
#[test]
fn test_empty_batch() {
    let empty = Batch::empty();
    assert!(empty.is_empty());
    assert_eq!(None, empty.first_timestamp());
    assert_eq!(None, empty.last_timestamp());
    assert_eq!(None, empty.first_sequence());
    assert_eq!(None, empty.last_sequence());
    assert!(empty.timestamps_native().is_none());
}
1262
/// With a single row, the first and last accessors return that row.
#[test]
fn test_first_last_one() {
    let batch = new_batch(&[1], &[2], &[OpType::Put], &[4]);
    let expected_ts = Timestamp::new_millisecond(1);
    assert_eq!(expected_ts, batch.first_timestamp().unwrap());
    assert_eq!(expected_ts, batch.last_timestamp().unwrap());
    assert_eq!(2, batch.first_sequence().unwrap());
    assert_eq!(2, batch.last_sequence().unwrap());
}
1277
/// With several rows, the first and last accessors return the boundary rows.
#[test]
fn test_first_last_multiple() {
    let ops = [OpType::Put, OpType::Put, OpType::Put];
    let batch = new_batch(&[1, 2, 3], &[11, 12, 13], &ops, &[21, 22, 23]);

    let first_ts = batch.first_timestamp().unwrap();
    let last_ts = batch.last_timestamp().unwrap();
    assert_eq!(Timestamp::new_millisecond(1), first_ts);
    assert_eq!(Timestamp::new_millisecond(3), last_ts);
    assert_eq!(11, batch.first_sequence().unwrap());
    assert_eq!(13, batch.last_sequence().unwrap());
}
1297
/// Slicing keeps exactly the requested rows of every column.
#[test]
fn test_slice() {
    let source = new_batch(
        &[1, 2, 3, 4],
        &[11, 12, 13, 14],
        &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
        &[21, 22, 23, 24],
    );
    let sliced = source.slice(1, 2);

    let expect = new_batch(
        &[2, 3],
        &[12, 13],
        &[OpType::Delete, OpType::Put],
        &[22, 23],
    );
    assert_eq!(expect, sliced);
}
1315
/// The native view exposes the raw timestamp values in row order.
#[test]
fn test_timestamps_native() {
    let ops = [OpType::Put, OpType::Delete, OpType::Put, OpType::Put];
    let batch = new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], &ops, &[21, 22, 23, 24]);
    let native = batch.timestamps_native().unwrap();
    assert_eq!(&[1, 2, 3, 4], native);
}
1326
/// Concatenating no batches is rejected as an invalid batch.
#[test]
fn test_concat_empty() {
    let result = Batch::concat(Vec::new());
    let err = result.unwrap_err();
    assert!(
        matches!(err, Error::InvalidBatch { .. }),
        "unexpected err: {err}"
    );
}
1335
/// Concatenating a single batch returns that batch unchanged.
#[test]
fn test_concat_one() {
    let empty = new_batch(&[], &[], &[], &[]);
    let concatenated = Batch::concat(vec![empty.clone()]).unwrap();
    assert_eq!(empty, concatenated);

    let two_rows = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
    let concatenated = Batch::concat(vec![two_rows.clone()]).unwrap();
    assert_eq!(two_rows, concatenated);
}
1346
/// Concatenation keeps row order across batches, including an empty one.
#[test]
fn test_concat_multiple() {
    let first = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
    let second = new_batch(
        &[3, 4, 5],
        &[13, 14, 15],
        &[OpType::Put, OpType::Delete, OpType::Put],
        &[23, 24, 25],
    );
    let third = new_batch(&[], &[], &[], &[]);
    let fourth = new_batch(&[6], &[16], &[OpType::Put], &[26]);

    let merged = Batch::concat(vec![first, second, third, fourth]).unwrap();

    let expected_ops = [
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Delete,
        OpType::Put,
        OpType::Put,
    ];
    let expect = new_batch(
        &[1, 2, 3, 4, 5, 6],
        &[11, 12, 13, 14, 15, 16],
        &expected_ops,
        &[21, 22, 23, 24, 25, 26],
    );
    assert_eq!(expect, merged);
}
1376
/// Concatenating batches whose primary keys differ fails with `Error::InvalidBatch`.
#[test]
fn test_concat_different() {
    let first = new_batch(&[1], &[1], &[OpType::Put], &[1]);
    let mut second = new_batch(&[2], &[2], &[OpType::Put], &[2]);
    // Give the second batch a different primary key.
    second.primary_key = b"hello".to_vec();

    let err = Batch::concat(vec![first, second]).unwrap_err();
    if !matches!(err, Error::InvalidBatch { .. }) {
        panic!("unexpected err: {err}");
    }
}
1388
/// Concatenating batches whose field columns differ fails with `Error::InvalidBatch`.
#[test]
fn test_concat_different_fields() {
    let base = new_batch(&[1], &[1], &[OpType::Put], &[1]);
    // A field column with id 2 that the base batch's field list is replaced or extended with.
    let column_two = || BatchColumn {
        column_id: 2,
        data: Arc::new(UInt64Vector::from_slice([2])),
    };
    let expect_invalid = |batches: Vec<Batch>| {
        let err = Batch::concat(batches).unwrap_err();
        assert!(
            matches!(err, Error::InvalidBatch { .. }),
            "unexpected err: {err}"
        );
    };

    // The second batch carries one more field than the first.
    let more_fields = vec![base.fields()[0].clone(), column_two()];
    let wider = base.clone().with_fields(more_fields).unwrap();
    expect_invalid(vec![base.clone(), wider]);

    // The second batch has the same number of fields but a different column.
    let other_fields = vec![column_two()];
    let swapped = base.clone().with_fields(other_fields).unwrap();
    expect_invalid(vec![base, swapped]);
}
1419
/// `filter_deleted` succeeds on an empty batch and leaves it empty.
#[test]
fn test_filter_deleted_empty() {
    let mut empty = new_batch(&[], &[], &[], &[]);
    let outcome = empty.filter_deleted();
    outcome.unwrap();
    assert!(empty.is_empty());
}
1426
/// `filter_deleted` drops rows marked `Delete` and keeps `Put` rows untouched.
#[test]
fn test_filter_deleted() {
    // Rows 1 and 3 are deletions, so only rows 2 and 4 remain.
    let mixed_ops = [OpType::Delete, OpType::Put, OpType::Delete, OpType::Put];
    let mut mixed = new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], &mixed_ops, &[21, 22, 23, 24]);
    mixed.filter_deleted().unwrap();
    let survivors = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
    assert_eq!(mixed, survivors);

    // A batch holding only puts comes back unchanged.
    let put_ops = [OpType::Put, OpType::Put, OpType::Put, OpType::Put];
    let mut puts = new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], &put_ops, &[21, 22, 23, 24]);
    let untouched = puts.clone();
    puts.filter_deleted().unwrap();
    assert_eq!(puts, untouched);
}
1449
/// Exercises `filter_by_sequence` with `LtEq`, `Gt` and `GtLtEq` ranges, with `None`,
/// and on empty batches.
#[test]
fn test_filter_by_sequence() {
    let all_puts = [OpType::Put, OpType::Put, OpType::Put, OpType::Put];
    let with_delete = [OpType::Put, OpType::Delete, OpType::Put, OpType::Put];
    // Four rows: timestamps 1..=4, sequences 11..=14, field values 21..=24.
    let four_rows =
        |ops: &[OpType]| new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], ops, &[21, 22, 23, 24]);

    // `LtEq { max: 13 }` keeps the rows with sequences 11, 12 and 13.
    {
        let mut filtered = four_rows(&all_puts);
        filtered
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 13 }))
            .unwrap();
        let expected = new_batch(
            &[1, 2, 3],
            &[11, 12, 13],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23],
        );
        assert_eq!(filtered, expected);
    }

    // `LtEq { max: 10 }` is below every sequence, so nothing is left.
    {
        let mut filtered = four_rows(&with_delete);
        filtered
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(filtered.is_empty());
    }

    // Without a range the batch is left as it was.
    {
        let mut filtered = four_rows(&with_delete);
        let unchanged = filtered.clone();
        filtered.filter_by_sequence(None).unwrap();
        assert_eq!(filtered, unchanged);
    }

    // An empty batch stays empty when a range is given.
    {
        let mut filtered = new_batch(&[], &[], &[], &[]);
        filtered
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(filtered.is_empty());
    }

    // An empty batch stays empty when no range is given.
    {
        let mut filtered = new_batch(&[], &[], &[], &[]);
        filtered.filter_by_sequence(None).unwrap();
        assert!(filtered.is_empty());
    }

    // `Gt { min: 12 }` excludes sequence 12 itself and keeps 13 and 14.
    {
        let mut filtered = four_rows(&all_puts);
        filtered
            .filter_by_sequence(Some(SequenceRange::Gt { min: 12 }))
            .unwrap();
        let expected = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(filtered, expected);
    }

    // `Gt { min: 20 }` is above every sequence, so nothing is left.
    {
        let mut filtered = four_rows(&with_delete);
        filtered
            .filter_by_sequence(Some(SequenceRange::Gt { min: 20 }))
            .unwrap();
        assert!(filtered.is_empty());
    }

    // `GtLtEq { min: 12, max: 14 }` keeps sequences 13 and 14 out of 11..=15.
    {
        let five_puts = [
            OpType::Put,
            OpType::Put,
            OpType::Put,
            OpType::Put,
            OpType::Put,
        ];
        let mut filtered = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &five_puts,
            &[21, 22, 23, 24, 25],
        );
        filtered
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 12, max: 14 }))
            .unwrap();
        let expected = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(filtered, expected);
    }

    // `GtLtEq { min: 11, max: 13 }` on mixed operations keeps the delete at 12 and the put at 13.
    {
        let alternating = [
            OpType::Put,
            OpType::Delete,
            OpType::Put,
            OpType::Delete,
            OpType::Put,
        ];
        let mut filtered = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &alternating,
            &[21, 22, 23, 24, 25],
        );
        filtered
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 11, max: 13 }))
            .unwrap();
        let expected = new_batch(
            &[2, 3],
            &[12, 13],
            &[OpType::Delete, OpType::Put],
            &[22, 23],
        );
        assert_eq!(filtered, expected);
    }

    // `GtLtEq { min: 20, max: 25 }` matches no sequence, so nothing is left.
    {
        let mut filtered = four_rows(&all_puts);
        filtered
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 20, max: 25 }))
            .unwrap();
        assert!(filtered.is_empty());
    }
}
1586
/// With distinct timestamps, `merge_last_non_null` leaves the batch unchanged.
#[test]
fn test_merge_last_non_null_no_dup() {
    let mut merged = new_batch_with_u64_fields(
        &[1, 2],
        &[2, 1],
        &[OpType::Put, OpType::Put],
        &[(1, &[Some(10), None]), (2, &[Some(100), Some(200)])],
    );
    let untouched = merged.clone();

    merged.merge_last_non_null().unwrap();

    assert_eq!(merged, untouched);
}
1599
/// Rows sharing a timestamp collapse into one row whose null field is filled from a later row.
#[test]
fn test_merge_last_non_null_fill_null_fields() {
    // Three rows at timestamp 1 with sequences 3, 2, 1; the first row has no value for field 1.
    let mut merged = new_batch_with_u64_fields(
        &[1, 1, 1],
        &[3, 2, 1],
        &[OpType::Put, OpType::Put, OpType::Put],
        &[
            (1, &[None, Some(10), Some(11)]),
            (2, &[Some(100), Some(200), Some(300)]),
        ],
    );
    merged.merge_last_non_null().unwrap();

    // The surviving row keeps sequence 3 and its own field 2 value (100),
    // while field 1 takes 10 from the row with sequence 2.
    let expected = new_batch_with_u64_fields(
        &[1],
        &[3],
        &[OpType::Put],
        &[(1, &[Some(10)]), (2, &[Some(100)])],
    );
    assert_eq!(merged, expected);
}
1624
/// A `Delete` row directly below the newest row prevents older values from filling its nulls.
#[test]
fn test_merge_last_non_null_stop_at_delete_row() {
    // Same data as the fill case, except the row with sequence 2 is a delete.
    let mut merged = new_batch_with_u64_fields(
        &[1, 1, 1],
        &[3, 2, 1],
        &[OpType::Put, OpType::Delete, OpType::Put],
        &[
            (1, &[None, Some(10), Some(11)]),
            (2, &[Some(100), Some(200), Some(300)]),
        ],
    );
    merged.merge_last_non_null().unwrap();

    // Field 1 stays null in the surviving row.
    let expected = new_batch_with_u64_fields(
        &[1],
        &[3],
        &[OpType::Put],
        &[(1, &[None]), (2, &[Some(100)])],
    );
    assert_eq!(merged, expected);
}
1648
/// When the newest row of a timestamp is a `Delete`, its null fields are not filled.
#[test]
fn test_merge_last_non_null_base_delete_no_merge() {
    let mut merged = new_batch_with_u64_fields(
        &[1, 1],
        &[3, 2],
        &[OpType::Delete, OpType::Put],
        &[(1, &[None, Some(10)]), (2, &[None, Some(200)])],
    );
    merged.merge_last_non_null().unwrap();

    // Only the delete row remains, with both fields still null.
    let expected = new_batch_with_u64_fields(
        &[1],
        &[3],
        &[OpType::Delete],
        &[(1, &[None]), (2, &[None])],
    );
    assert_eq!(merged, expected);
}
1664
/// Each group of equal timestamps is merged independently of the others.
#[test]
fn test_merge_last_non_null_multiple_timestamp_groups() {
    let five_puts = [
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Put,
    ];
    // Timestamps 1 and 3 each appear twice; timestamp 2 appears once.
    let mut merged = new_batch_with_u64_fields(
        &[1, 1, 2, 3, 3],
        &[5, 4, 3, 2, 1],
        &five_puts,
        &[
            (1, &[None, Some(10), Some(20), None, Some(30)]),
            (2, &[Some(100), Some(110), Some(120), None, Some(130)]),
        ],
    );
    merged.merge_last_non_null().unwrap();

    // One row per timestamp remains, carrying the first row's sequence of each group.
    let expected = new_batch_with_u64_fields(
        &[1, 2, 3],
        &[5, 3, 2],
        &[OpType::Put, OpType::Put, OpType::Put],
        &[
            (1, &[Some(10), Some(20), Some(30)]),
            (2, &[Some(100), Some(120), Some(130)]),
        ],
    );
    assert_eq!(merged, expected);
}
1695
/// A batch without field columns still collapses rows that share a timestamp.
#[test]
fn test_merge_last_non_null_no_fields() {
    let three_puts = [OpType::Put, OpType::Put, OpType::Put];
    let mut merged = new_batch_without_fields(&[1, 1, 2], &[3, 2, 1], &three_puts);

    merged.merge_last_non_null().unwrap();

    let expected = new_batch_without_fields(&[1, 2], &[3, 1], &[OpType::Put, OpType::Put]);
    assert_eq!(merged, expected);
}
1708
/// `filter` keeps exactly the rows whose predicate entry is `true`.
#[test]
fn test_filter() {
    let keep_last_two = BooleanVector::from_vec(vec![false, false, true, true]);
    let last_two = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);

    // A batch that contains only puts.
    let put_ops = [OpType::Put, OpType::Put, OpType::Put, OpType::Put];
    let mut puts = new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], &put_ops, &[21, 22, 23, 24]);
    puts.filter(&keep_last_two).unwrap();
    assert_eq!(puts, last_two);

    // A batch with a delete among the rows that are filtered out.
    let mixed_ops = [OpType::Put, OpType::Delete, OpType::Put, OpType::Put];
    let mut mixed = new_batch(&[1, 2, 3, 4], &[11, 12, 13, 14], &mixed_ops, &[21, 22, 23, 24]);
    mixed.filter(&keep_last_two).unwrap();
    assert_eq!(mixed, last_two);

    // Rejecting both remaining rows empties the batch.
    let keep_none = BooleanVector::from_vec(vec![false, false]);
    mixed.filter(&keep_none).unwrap();
    assert!(mixed.is_empty());
}
1740
/// Checks `sort` with and without the dedup flag, for put-only and mixed batches.
#[test]
fn test_sort_and_dedup() {
    // Sorts a copy so the same unsorted input can be reused for both flag values.
    let sorted = |source: &Batch, dedup: bool| {
        let mut copy = source.clone();
        copy.sort(dedup).unwrap();
        copy
    };

    // Unsorted puts in which timestamp 2 appears twice (sequences 1 and 6).
    let six_puts = [
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Put,
        OpType::Put,
    ];
    let unsorted = new_batch(
        &[2, 3, 1, 4, 5, 2],
        &[1, 2, 3, 4, 5, 6],
        &six_puts,
        &[21, 22, 23, 24, 25, 26],
    );

    // With the flag set, only the sequence-6 row survives for timestamp 2.
    let deduped = new_batch(
        &[1, 2, 3, 4, 5],
        &[3, 6, 2, 4, 5],
        &six_puts[..5],
        &[23, 26, 22, 24, 25],
    );
    assert_eq!(deduped, sorted(&unsorted, true));

    // With the flag cleared, both timestamp-2 rows stay, higher sequence first.
    let ordered = new_batch(
        &[1, 2, 2, 3, 4, 5],
        &[3, 6, 1, 2, 4, 5],
        &six_puts,
        &[23, 26, 21, 22, 24, 25],
    );
    assert_eq!(ordered, sorted(&unsorted, false));

    // Timestamp 2 now has a delete (sequence 1) and a put (sequence 6).
    let unsorted = new_batch(
        &[2, 2, 1],
        &[1, 6, 1],
        &[OpType::Delete, OpType::Put, OpType::Put],
        &[21, 22, 23],
    );

    let deduped = new_batch(&[1, 2], &[1, 6], &[OpType::Put, OpType::Put], &[23, 22]);
    assert_eq!(deduped, sorted(&unsorted, true));

    let ordered = new_batch(
        &[1, 2, 2],
        &[1, 6, 1],
        &[OpType::Put, OpType::Put, OpType::Delete],
        &[23, 22, 21],
    );
    assert_eq!(ordered, sorted(&unsorted, false));
}
1819
/// Primary-key and field column values can be read back from a batch under both the
/// dense and the sparse primary-key encodings.
#[test]
fn test_get_value() {
    for encoding in [PrimaryKeyEncoding::Dense, PrimaryKeyEncoding::Sparse] {
        let string_field = || row_converter::SortField::new(ConcreteDataType::string_datatype());
        let codec = build_primary_key_codec_with_fields(
            encoding,
            [
                (
                    ReservedColumnId::table_id(),
                    row_converter::SortField::new(ConcreteDataType::uint32_datatype()),
                ),
                (
                    ReservedColumnId::tsid(),
                    row_converter::SortField::new(ConcreteDataType::uint64_datatype()),
                ),
                (100, string_field()),
                (200, string_field()),
            ]
            .into_iter(),
        );

        // Primary-key columns in key order, paired with the value encoded for each.
        let key_columns = [
            (ReservedColumnId::table_id(), Value::UInt32(1000)),
            (ReservedColumnId::tsid(), Value::UInt64(2000)),
            (100, Value::String("abcdefgh".into())),
            (200, Value::String("zyxwvu".into())),
        ];
        let mut encoded_key = vec![];
        codec.encode_values(&key_columns, &mut encoded_key).unwrap();

        let field_col_id = 2;
        let mut batch = new_batch_builder(
            &encoded_key,
            &[1, 2, 3],
            &[1, 1, 1],
            &[OpType::Put, OpType::Put, OpType::Put],
            field_col_id,
            &[42, 43, 44],
        )
        .build()
        .unwrap();

        // Every key column is looked up by its position in the key and its column id.
        for (position, (column_id, expected)) in key_columns.iter().enumerate() {
            let decoded = batch
                .pk_col_value(&*codec, position, *column_id)
                .unwrap()
                .unwrap();
            assert_eq!(*expected, *decoded);
        }

        // The field column holds the three values passed to the builder.
        let field = batch.field_col_value(field_col_id).unwrap();
        for (row, expected) in [42, 43, 44].into_iter().enumerate() {
            assert_eq!(field.data.get(row), Value::UInt64(expected));
        }
    }
}
1903}