Skip to main content

mito2/
read.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Common structs and utilities for reading data.
16
17pub mod batch_adapter;
18pub mod compat;
19pub mod dedup;
20pub mod flat_dedup;
21pub mod flat_merge;
22pub mod flat_projection;
23pub mod last_row;
24pub mod projection;
25pub(crate) mod prune;
26pub(crate) mod pruner;
27pub mod range;
28#[cfg(feature = "test")]
29pub mod range_cache;
30#[cfg(not(feature = "test"))]
31pub(crate) mod range_cache;
32pub mod scan_region;
33pub mod scan_util;
34pub(crate) mod seq_scan;
35pub mod series_scan;
36pub mod stream;
37pub(crate) mod unordered_scan;
38
39use std::collections::HashMap;
40use std::sync::Arc;
41use std::time::Duration;
42
43use api::v1::OpType;
44use async_trait::async_trait;
45use common_time::Timestamp;
46use datafusion_common::arrow::array::UInt8Array;
47use datatypes::arrow;
48use datatypes::arrow::array::{Array, ArrayRef};
49use datatypes::arrow::compute::SortOptions;
50use datatypes::arrow::record_batch::RecordBatch;
51use datatypes::arrow::row::{RowConverter, SortField};
52use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
53use datatypes::scalars::ScalarVectorBuilder;
54use datatypes::types::TimestampType;
55use datatypes::value::{Value, ValueRef};
56use datatypes::vectors::{
57    BooleanVector, Helper, TimestampMicrosecondVector, TimestampMillisecondVector,
58    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampSecondVector,
59    UInt8Vector, UInt8VectorBuilder, UInt32Vector, UInt64Vector, UInt64VectorBuilder, Vector,
60    VectorRef,
61};
62use futures::TryStreamExt;
63use futures::stream::BoxStream;
64use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
65use snafu::{OptionExt, ResultExt, ensure};
66use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
67
68use crate::error::{
69    ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu,
70    Result,
71};
72use crate::memtable::{BoxedBatchIterator, BoxedRecordBatchIterator};
73/// Storage internal representation of a batch of rows for a primary key (time series).
74///
75/// Rows are sorted by primary key, timestamp, sequence desc, op_type desc. Fields
76/// always keep the same relative order as fields in [RegionMetadata](store_api::metadata::RegionMetadata).
77#[derive(Debug, PartialEq, Clone)]
78pub struct Batch {
79    /// Primary key encoded in a comparable form.
80    primary_key: Vec<u8>,
81    /// Possibly decoded `primary_key` values. Some places would decode it in advance.
82    pk_values: Option<CompositeValues>,
83    /// Timestamps of rows, should be sorted and not null.
84    timestamps: VectorRef,
85    /// Sequences of rows
86    ///
87    /// UInt64 type, not null.
88    sequences: Arc<UInt64Vector>,
89    /// Op types of rows
90    ///
91    /// UInt8 type, not null.
92    op_types: Arc<UInt8Vector>,
93    /// Fields organized in columnar format.
94    fields: Vec<BatchColumn>,
95    /// Cache for field index lookup.
96    fields_idx: Option<HashMap<ColumnId, usize>>,
97}
98
99impl Batch {
100    /// Creates a new batch.
101    pub fn new(
102        primary_key: Vec<u8>,
103        timestamps: VectorRef,
104        sequences: Arc<UInt64Vector>,
105        op_types: Arc<UInt8Vector>,
106        fields: Vec<BatchColumn>,
107    ) -> Result<Batch> {
108        BatchBuilder::with_required_columns(primary_key, timestamps, sequences, op_types)
109            .with_fields(fields)
110            .build()
111    }
112
113    /// Tries to set fields for the batch.
114    pub fn with_fields(self, fields: Vec<BatchColumn>) -> Result<Batch> {
115        Batch::new(
116            self.primary_key,
117            self.timestamps,
118            self.sequences,
119            self.op_types,
120            fields,
121        )
122    }
123
124    /// Returns primary key of the batch.
125    pub fn primary_key(&self) -> &[u8] {
126        &self.primary_key
127    }
128
129    /// Returns possibly decoded primary-key values.
130    pub fn pk_values(&self) -> Option<&CompositeValues> {
131        self.pk_values.as_ref()
132    }
133
134    /// Sets possibly decoded primary-key values.
135    pub fn set_pk_values(&mut self, pk_values: CompositeValues) {
136        self.pk_values = Some(pk_values);
137    }
138
139    /// Removes possibly decoded primary-key values. For testing only.
140    #[cfg(any(test, feature = "test"))]
141    pub fn remove_pk_values(&mut self) {
142        self.pk_values = None;
143    }
144
145    /// Returns fields in the batch.
146    pub fn fields(&self) -> &[BatchColumn] {
147        &self.fields
148    }
149
150    /// Returns timestamps of the batch.
151    pub fn timestamps(&self) -> &VectorRef {
152        &self.timestamps
153    }
154
155    /// Returns sequences of the batch.
156    pub fn sequences(&self) -> &Arc<UInt64Vector> {
157        &self.sequences
158    }
159
160    /// Returns op types of the batch.
161    pub fn op_types(&self) -> &Arc<UInt8Vector> {
162        &self.op_types
163    }
164
165    /// Returns the number of rows in the batch.
166    pub fn num_rows(&self) -> usize {
167        // All vectors have the same length. We use the length of sequences vector
168        // since it has static type.
169        self.sequences.len()
170    }
171
172    /// Create an empty [`Batch`].
173    #[allow(dead_code)]
174    pub(crate) fn empty() -> Self {
175        Self {
176            primary_key: vec![],
177            pk_values: None,
178            timestamps: Arc::new(TimestampMillisecondVectorBuilder::with_capacity(0).finish()),
179            sequences: Arc::new(UInt64VectorBuilder::with_capacity(0).finish()),
180            op_types: Arc::new(UInt8VectorBuilder::with_capacity(0).finish()),
181            fields: vec![],
182            fields_idx: None,
183        }
184    }
185
186    /// Returns true if the number of rows in the batch is 0.
187    pub fn is_empty(&self) -> bool {
188        self.num_rows() == 0
189    }
190
191    /// Returns the first timestamp in the batch or `None` if the batch is empty.
192    pub fn first_timestamp(&self) -> Option<Timestamp> {
193        if self.timestamps.is_empty() {
194            return None;
195        }
196
197        Some(self.get_timestamp(0))
198    }
199
200    /// Returns the last timestamp in the batch or `None` if the batch is empty.
201    pub fn last_timestamp(&self) -> Option<Timestamp> {
202        if self.timestamps.is_empty() {
203            return None;
204        }
205
206        Some(self.get_timestamp(self.timestamps.len() - 1))
207    }
208
209    /// Returns the first sequence in the batch or `None` if the batch is empty.
210    pub fn first_sequence(&self) -> Option<SequenceNumber> {
211        if self.sequences.is_empty() {
212            return None;
213        }
214
215        Some(self.get_sequence(0))
216    }
217
218    /// Returns the last sequence in the batch or `None` if the batch is empty.
219    pub fn last_sequence(&self) -> Option<SequenceNumber> {
220        if self.sequences.is_empty() {
221            return None;
222        }
223
224        Some(self.get_sequence(self.sequences.len() - 1))
225    }
226
227    /// Replaces the primary key of the batch.
228    ///
229    /// Notice that this [Batch] also contains a maybe-exist `pk_values`.
230    /// Be sure to update that field as well.
231    pub fn set_primary_key(&mut self, primary_key: Vec<u8>) {
232        self.primary_key = primary_key;
233    }
234
235    /// Slice the batch, returning a new batch.
236    ///
237    /// # Panics
238    /// Panics if `offset + length > self.num_rows()`.
239    pub fn slice(&self, offset: usize, length: usize) -> Batch {
240        let fields = self
241            .fields
242            .iter()
243            .map(|column| BatchColumn {
244                column_id: column.column_id,
245                data: column.data.slice(offset, length),
246            })
247            .collect();
248        // We skip using the builder to avoid validating the batch again.
249        Batch {
250            // Now we need to clone the primary key. We could try `Bytes` if
251            // this becomes a bottleneck.
252            primary_key: self.primary_key.clone(),
253            pk_values: self.pk_values.clone(),
254            timestamps: self.timestamps.slice(offset, length),
255            sequences: Arc::new(self.sequences.get_slice(offset, length)),
256            op_types: Arc::new(self.op_types.get_slice(offset, length)),
257            fields,
258            fields_idx: self.fields_idx.clone(),
259        }
260    }
261
262    /// Takes `batches` and concat them into one batch.
263    ///
264    /// All `batches` must have the same primary key.
265    pub fn concat(mut batches: Vec<Batch>) -> Result<Batch> {
266        ensure!(
267            !batches.is_empty(),
268            InvalidBatchSnafu {
269                reason: "empty batches",
270            }
271        );
272        if batches.len() == 1 {
273            // Now we own the `batches` so we could pop it directly.
274            return Ok(batches.pop().unwrap());
275        }
276
277        let primary_key = std::mem::take(&mut batches[0].primary_key);
278        let first = &batches[0];
279        // We took the primary key from the first batch so we don't use `first.primary_key()`.
280        ensure!(
281            batches
282                .iter()
283                .skip(1)
284                .all(|b| b.primary_key() == primary_key),
285            InvalidBatchSnafu {
286                reason: "batches have different primary key",
287            }
288        );
289        for b in batches.iter().skip(1) {
290            ensure!(
291                b.fields.len() == first.fields.len(),
292                InvalidBatchSnafu {
293                    reason: "batches have different field num",
294                }
295            );
296            for (l, r) in b.fields.iter().zip(&first.fields) {
297                ensure!(
298                    l.column_id == r.column_id,
299                    InvalidBatchSnafu {
300                        reason: "batches have different fields",
301                    }
302                );
303            }
304        }
305
306        // We take the primary key from the first batch.
307        let mut builder = BatchBuilder::new(primary_key);
308        // Concat timestamps, sequences, op_types, fields.
309        let array = concat_arrays(batches.iter().map(|b| b.timestamps().to_arrow_array()))?;
310        builder.timestamps_array(array)?;
311        let array = concat_arrays(batches.iter().map(|b| b.sequences().to_arrow_array()))?;
312        builder.sequences_array(array)?;
313        let array = concat_arrays(batches.iter().map(|b| b.op_types().to_arrow_array()))?;
314        builder.op_types_array(array)?;
315        for (i, batch_column) in first.fields.iter().enumerate() {
316            let array = concat_arrays(batches.iter().map(|b| b.fields()[i].data.to_arrow_array()))?;
317            builder.push_field_array(batch_column.column_id, array)?;
318        }
319
320        builder.build()
321    }
322
323    /// Removes rows whose op type is delete.
324    pub fn filter_deleted(&mut self) -> Result<()> {
325        // Safety: op type column is not null.
326        let array = self.op_types.as_arrow();
327        // Find rows with non-delete op type.
328        let rhs = UInt8Array::new_scalar(OpType::Delete as u8);
329        let predicate =
330            arrow::compute::kernels::cmp::neq(array, &rhs).context(ComputeArrowSnafu)?;
331        self.filter(&BooleanVector::from(predicate))
332    }
333
334    // Applies the `predicate` to the batch.
335    // Safety: We know the array type so we unwrap on casting.
336    pub fn filter(&mut self, predicate: &BooleanVector) -> Result<()> {
337        self.timestamps = self
338            .timestamps
339            .filter(predicate)
340            .context(ComputeVectorSnafu)?;
341        self.sequences = Arc::new(
342            UInt64Vector::try_from_arrow_array(
343                arrow::compute::filter(self.sequences.as_arrow(), predicate.as_boolean_array())
344                    .context(ComputeArrowSnafu)?,
345            )
346            .unwrap(),
347        );
348        self.op_types = Arc::new(
349            UInt8Vector::try_from_arrow_array(
350                arrow::compute::filter(self.op_types.as_arrow(), predicate.as_boolean_array())
351                    .context(ComputeArrowSnafu)?,
352            )
353            .unwrap(),
354        );
355        for batch_column in &mut self.fields {
356            batch_column.data = batch_column
357                .data
358                .filter(predicate)
359                .context(ComputeVectorSnafu)?;
360        }
361
362        Ok(())
363    }
364
365    /// Filters rows by the given `sequence`. Only preserves rows with sequence less than or equal to `sequence`.
366    pub fn filter_by_sequence(&mut self, sequence: Option<SequenceRange>) -> Result<()> {
367        let seq_range = match sequence {
368            None => return Ok(()),
369            Some(seq_range) => {
370                let (Some(first), Some(last)) = (self.first_sequence(), self.last_sequence())
371                else {
372                    return Ok(());
373                };
374                let is_subset = match seq_range {
375                    SequenceRange::Gt { min } => min < first,
376                    SequenceRange::LtEq { max } => max >= last,
377                    SequenceRange::GtLtEq { min, max } => min < first && max >= last,
378                };
379                if is_subset {
380                    return Ok(());
381                }
382                seq_range
383            }
384        };
385
386        let seqs = self.sequences.as_arrow();
387        let predicate = seq_range.filter(seqs).context(ComputeArrowSnafu)?;
388
389        let predicate = BooleanVector::from(predicate);
390        self.filter(&predicate)?;
391
392        Ok(())
393    }
394
395    /// Sorts rows in the batch. If `dedup` is true, it also removes
396    /// duplicated rows according to primary keys.
397    ///
398    /// It orders rows by timestamp, sequence desc and only keep the latest
399    /// row for the same timestamp. It doesn't consider op type as sequence
400    /// should already provide uniqueness for a row.
401    pub fn sort(&mut self, dedup: bool) -> Result<()> {
402        // If building a converter each time is costly, we may allow passing a
403        // converter.
404        let converter = RowConverter::new(vec![
405            SortField::new(self.timestamps.data_type().as_arrow_type()),
406            SortField::new_with_options(
407                self.sequences.data_type().as_arrow_type(),
408                SortOptions {
409                    descending: true,
410                    ..Default::default()
411                },
412            ),
413        ])
414        .context(ComputeArrowSnafu)?;
415        // Columns to sort.
416        let columns = [
417            self.timestamps.to_arrow_array(),
418            self.sequences.to_arrow_array(),
419        ];
420        let rows = converter.convert_columns(&columns).unwrap();
421        let mut to_sort: Vec<_> = rows.iter().enumerate().collect();
422
423        let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
424        if !was_sorted {
425            to_sort.sort_unstable_by_key(|x| x.1);
426        }
427
428        let num_rows = to_sort.len();
429        if dedup {
430            // Dedup by timestamps.
431            to_sort.dedup_by(|left, right| {
432                debug_assert_eq!(18, left.1.as_ref().len());
433                debug_assert_eq!(18, right.1.as_ref().len());
434                let (left_key, right_key) = (left.1.as_ref(), right.1.as_ref());
435                // We only compare the timestamp part and ignore sequence.
436                left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
437            });
438        }
439        let no_dedup = to_sort.len() == num_rows;
440
441        if was_sorted && no_dedup {
442            return Ok(());
443        }
444        let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
445        self.take_in_place(&indices)
446    }
447
448    /// Merges duplicated timestamps in the batch by keeping the latest non-null field values.
449    ///
450    /// Rows must already be sorted by timestamp (ascending) and sequence (descending).
451    ///
452    /// This method deduplicates rows with the same timestamp (keeping the first row in each
453    /// timestamp range as the base row) and fills null fields from subsequent rows until all
454    /// fields are filled or a delete operation is encountered.
455    pub(crate) fn merge_last_non_null(&mut self) -> Result<()> {
456        let num_rows = self.num_rows();
457        if num_rows < 2 {
458            return Ok(());
459        }
460
461        let Some(timestamps) = self.timestamps_native() else {
462            return Ok(());
463        };
464
465        // Fast path: check if there are any duplicate timestamps.
466        let mut has_dup = false;
467        let mut group_count = 1;
468        for i in 1..num_rows {
469            has_dup |= timestamps[i] == timestamps[i - 1];
470            group_count += (timestamps[i] != timestamps[i - 1]) as usize;
471        }
472        if !has_dup {
473            return Ok(());
474        }
475
476        let num_fields = self.fields.len();
477        let op_types = self.op_types.as_arrow().values();
478
479        let mut base_indices: Vec<u32> = Vec::with_capacity(group_count);
480        let mut field_indices: Vec<Vec<u32>> = (0..num_fields)
481            .map(|_| Vec::with_capacity(group_count))
482            .collect();
483
484        let mut start = 0;
485        while start < num_rows {
486            let ts = timestamps[start];
487            let mut end = start + 1;
488            while end < num_rows && timestamps[end] == ts {
489                end += 1;
490            }
491
492            let group_pos = base_indices.len();
493            base_indices.push(start as u32);
494
495            if num_fields > 0 {
496                // Default: take the base row for all fields.
497                for idx in &mut field_indices {
498                    idx.push(start as u32);
499                }
500
501                let base_deleted = op_types[start] == OpType::Delete as u8;
502                if !base_deleted {
503                    // Track fields that are null in the base row and try to fill them from older
504                    // rows in the same timestamp range.
505                    let mut missing_fields = Vec::new();
506                    for (field_idx, col) in self.fields.iter().enumerate() {
507                        if col.data.is_null(start) {
508                            missing_fields.push(field_idx);
509                        }
510                    }
511
512                    if !missing_fields.is_empty() {
513                        for row_idx in (start + 1)..end {
514                            if op_types[row_idx] == OpType::Delete as u8 {
515                                break;
516                            }
517
518                            missing_fields.retain(|&field_idx| {
519                                if self.fields[field_idx].data.is_null(row_idx) {
520                                    true
521                                } else {
522                                    field_indices[field_idx][group_pos] = row_idx as u32;
523                                    false
524                                }
525                            });
526
527                            if missing_fields.is_empty() {
528                                break;
529                            }
530                        }
531                    }
532                }
533            }
534
535            start = end;
536        }
537
538        let base_indices = UInt32Vector::from_vec(base_indices);
539        self.timestamps = self
540            .timestamps
541            .take(&base_indices)
542            .context(ComputeVectorSnafu)?;
543        let array = arrow::compute::take(self.sequences.as_arrow(), base_indices.as_arrow(), None)
544            .context(ComputeArrowSnafu)?;
545        // Safety: We know the array and vector type.
546        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
547        let array = arrow::compute::take(self.op_types.as_arrow(), base_indices.as_arrow(), None)
548            .context(ComputeArrowSnafu)?;
549        // Safety: We know the array and vector type.
550        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
551
552        for (field_idx, batch_column) in self.fields.iter_mut().enumerate() {
553            let idx = UInt32Vector::from_vec(std::mem::take(&mut field_indices[field_idx]));
554            batch_column.data = batch_column.data.take(&idx).context(ComputeVectorSnafu)?;
555        }
556
557        Ok(())
558    }
559
560    /// Returns the estimated memory size of the batch.
561    pub fn memory_size(&self) -> usize {
562        let mut size = std::mem::size_of::<Self>();
563        size += self.primary_key.len();
564        size += self.timestamps.memory_size();
565        size += self.sequences.memory_size();
566        size += self.op_types.memory_size();
567        for batch_column in &self.fields {
568            size += batch_column.data.memory_size();
569        }
570        size
571    }
572
573    /// Returns timestamps in a native slice or `None` if the batch is empty.
574    pub(crate) fn timestamps_native(&self) -> Option<&[i64]> {
575        if self.timestamps.is_empty() {
576            return None;
577        }
578
579        let values = match self.timestamps.data_type() {
580            ConcreteDataType::Timestamp(TimestampType::Second(_)) => self
581                .timestamps
582                .as_any()
583                .downcast_ref::<TimestampSecondVector>()
584                .unwrap()
585                .as_arrow()
586                .values(),
587            ConcreteDataType::Timestamp(TimestampType::Millisecond(_)) => self
588                .timestamps
589                .as_any()
590                .downcast_ref::<TimestampMillisecondVector>()
591                .unwrap()
592                .as_arrow()
593                .values(),
594            ConcreteDataType::Timestamp(TimestampType::Microsecond(_)) => self
595                .timestamps
596                .as_any()
597                .downcast_ref::<TimestampMicrosecondVector>()
598                .unwrap()
599                .as_arrow()
600                .values(),
601            ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)) => self
602                .timestamps
603                .as_any()
604                .downcast_ref::<TimestampNanosecondVector>()
605                .unwrap()
606                .as_arrow()
607                .values(),
608            other => panic!("timestamps in a Batch has other type {:?}", other),
609        };
610
611        Some(values)
612    }
613
614    /// Takes the batch in place.
615    fn take_in_place(&mut self, indices: &UInt32Vector) -> Result<()> {
616        self.timestamps = self.timestamps.take(indices).context(ComputeVectorSnafu)?;
617        let array = arrow::compute::take(self.sequences.as_arrow(), indices.as_arrow(), None)
618            .context(ComputeArrowSnafu)?;
619        // Safety: we know the array and vector type.
620        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
621        let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
622            .context(ComputeArrowSnafu)?;
623        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
624        for batch_column in &mut self.fields {
625            batch_column.data = batch_column
626                .data
627                .take(indices)
628                .context(ComputeVectorSnafu)?;
629        }
630
631        Ok(())
632    }
633
634    /// Gets a timestamp at given `index`.
635    ///
636    /// # Panics
637    /// Panics if `index` is out-of-bound or the timestamp vector returns null.
638    fn get_timestamp(&self, index: usize) -> Timestamp {
639        match self.timestamps.get_ref(index) {
640            ValueRef::Timestamp(timestamp) => timestamp,
641
642            // We have check the data type is timestamp compatible in the [BatchBuilder] so it's safe to panic.
643            value => panic!("{:?} is not a timestamp", value),
644        }
645    }
646
647    /// Gets a sequence at given `index`.
648    ///
649    /// # Panics
650    /// Panics if `index` is out-of-bound or the sequence vector returns null.
651    pub(crate) fn get_sequence(&self, index: usize) -> SequenceNumber {
652        // Safety: sequences is not null so it actually returns Some.
653        self.sequences.get_data(index).unwrap()
654    }
655
656    /// Checks the batch is monotonic by timestamps.
657    #[cfg(debug_assertions)]
658    #[allow(dead_code)]
659    pub(crate) fn check_monotonic(&self) -> Result<(), String> {
660        use std::cmp::Ordering;
661        if self.timestamps_native().is_none() {
662            return Ok(());
663        }
664
665        let timestamps = self.timestamps_native().unwrap();
666        let sequences = self.sequences.as_arrow().values();
667        for (i, window) in timestamps.windows(2).enumerate() {
668            let current = window[0];
669            let next = window[1];
670            let current_sequence = sequences[i];
671            let next_sequence = sequences[i + 1];
672            match current.cmp(&next) {
673                Ordering::Less => {
674                    // The current timestamp is less than the next timestamp.
675                    continue;
676                }
677                Ordering::Equal => {
678                    // The current timestamp is equal to the next timestamp.
679                    if current_sequence < next_sequence {
680                        return Err(format!(
681                            "sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
682                            current, next, current_sequence, next_sequence, i
683                        ));
684                    }
685                }
686                Ordering::Greater => {
687                    // The current timestamp is greater than the next timestamp.
688                    return Err(format!(
689                        "timestamps are not monotonic: {} > {}, index: {}",
690                        current, next, i
691                    ));
692                }
693            }
694        }
695
696        Ok(())
697    }
698
699    /// Returns Ok if the given batch is behind the current batch.
700    #[cfg(debug_assertions)]
701    #[allow(dead_code)]
702    pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
703        // Checks the primary key
704        if self.primary_key() < other.primary_key() {
705            return Ok(());
706        }
707        if self.primary_key() > other.primary_key() {
708            return Err(format!(
709                "primary key is not monotonic: {:?} > {:?}",
710                self.primary_key(),
711                other.primary_key()
712            ));
713        }
714        // Checks the timestamp.
715        if self.last_timestamp() < other.first_timestamp() {
716            return Ok(());
717        }
718        if self.last_timestamp() > other.first_timestamp() {
719            return Err(format!(
720                "timestamps are not monotonic: {:?} > {:?}",
721                self.last_timestamp(),
722                other.first_timestamp()
723            ));
724        }
725        // Checks the sequence.
726        if self.last_sequence() >= other.first_sequence() {
727            return Ok(());
728        }
729        Err(format!(
730            "sequences are not monotonic: {:?} < {:?}",
731            self.last_sequence(),
732            other.first_sequence()
733        ))
734    }
735
736    /// Returns the value of the column in the primary key.
737    ///
738    /// Lazily decodes the primary key and caches the result.
739    pub fn pk_col_value(
740        &mut self,
741        codec: &dyn PrimaryKeyCodec,
742        col_idx_in_pk: usize,
743        column_id: ColumnId,
744    ) -> Result<Option<&Value>> {
745        if self.pk_values.is_none() {
746            self.pk_values = Some(codec.decode(&self.primary_key).context(DecodeSnafu)?);
747        }
748
749        let pk_values = self.pk_values.as_ref().unwrap();
750        Ok(match pk_values {
751            CompositeValues::Dense(values) => values.get(col_idx_in_pk).map(|(_, v)| v),
752            CompositeValues::Sparse(values) => values.get(&column_id),
753        })
754    }
755
756    /// Returns values of the field in the batch.
757    ///
758    /// Lazily caches the field index.
759    pub fn field_col_value(&mut self, column_id: ColumnId) -> Option<&BatchColumn> {
760        if self.fields_idx.is_none() {
761            self.fields_idx = Some(
762                self.fields
763                    .iter()
764                    .enumerate()
765                    .map(|(i, c)| (c.column_id, i))
766                    .collect(),
767            );
768        }
769
770        self.fields_idx
771            .as_ref()
772            .unwrap()
773            .get(&column_id)
774            .map(|&idx| &self.fields[idx])
775    }
776}
777
778/// A struct to check the batch is monotonic.
779#[cfg(debug_assertions)]
780#[derive(Default)]
781#[allow(dead_code)]
782pub(crate) struct BatchChecker {
783    last_batch: Option<Batch>,
784    start: Option<Timestamp>,
785    end: Option<Timestamp>,
786}
787
788#[cfg(debug_assertions)]
789#[allow(dead_code)]
790impl BatchChecker {
791    /// Attaches the given start timestamp to the checker.
792    pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
793        self.start = start;
794        self
795    }
796
797    /// Attaches the given end timestamp to the checker.
798    pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
799        self.end = end;
800        self
801    }
802
803    /// Returns true if the given batch is monotonic and behind
804    /// the last batch.
805    pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
806        batch.check_monotonic()?;
807
808        if let (Some(start), Some(first)) = (self.start, batch.first_timestamp())
809            && start > first
810        {
811            return Err(format!(
812                "batch's first timestamp is before the start timestamp: {:?} > {:?}",
813                start, first
814            ));
815        }
816        if let (Some(end), Some(last)) = (self.end, batch.last_timestamp())
817            && end <= last
818        {
819            return Err(format!(
820                "batch's last timestamp is after the end timestamp: {:?} <= {:?}",
821                end, last
822            ));
823        }
824
825        // Checks the batch is behind the last batch.
826        // Then Updates the last batch.
827        let res = self
828            .last_batch
829            .as_ref()
830            .map(|last| last.check_next_batch(batch))
831            .unwrap_or(Ok(()));
832        self.last_batch = Some(batch.clone());
833        res
834    }
835
836    /// Formats current batch and last batch for debug.
837    pub(crate) fn format_batch(&self, batch: &Batch) -> String {
838        use std::fmt::Write;
839
840        let mut message = String::new();
841        if let Some(last) = &self.last_batch {
842            write!(
843                message,
844                "last_pk: {:?}, last_ts: {:?}, last_seq: {:?}, ",
845                last.primary_key(),
846                last.last_timestamp(),
847                last.last_sequence()
848            )
849            .unwrap();
850        }
851        write!(
852            message,
853            "batch_pk: {:?}, batch_ts: {:?}, batch_seq: {:?}",
854            batch.primary_key(),
855            batch.timestamps(),
856            batch.sequences()
857        )
858        .unwrap();
859
860        message
861    }
862
863    /// Checks batches from the part range are monotonic. Otherwise, panics.
864    pub(crate) fn ensure_part_range_batch(
865        &mut self,
866        scanner: &str,
867        region_id: store_api::storage::RegionId,
868        partition: usize,
869        part_range: store_api::region_engine::PartitionRange,
870        batch: &Batch,
871    ) {
872        if let Err(e) = self.check_monotonic(batch) {
873            let err_msg = format!(
874                "{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
875                scanner, e, region_id, partition, part_range,
876            );
877            common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
878            // Only print the number of row in the panic message.
879            panic!("{err_msg}, batch rows: {}", batch.num_rows());
880        }
881    }
882}
883
/// Length of a timestamp in the arrow row format.
// NOTE(review): presumably 1 validity byte plus 8 bytes for the i64 value; confirm
// against arrow's row encoding.
const TIMESTAMP_KEY_LEN: usize = 9;
886
887/// Helper function to concat arrays from `iter`.
888fn concat_arrays(iter: impl Iterator<Item = ArrayRef>) -> Result<ArrayRef> {
889    let arrays: Vec<_> = iter.collect();
890    let dyn_arrays: Vec<_> = arrays.iter().map(|array| array.as_ref()).collect();
891    arrow::compute::concat(&dyn_arrays).context(ComputeArrowSnafu)
892}
893
/// A column in a [Batch].
///
/// Pairs a column id with the vector that stores the values of that column.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct BatchColumn {
    /// Id of the column.
    pub column_id: ColumnId,
    /// Data of the column.
    pub data: VectorRef,
}
902
/// Builder to build [Batch].
///
/// Timestamps, sequences and op types are optional while building but all of
/// them must be set before the batch can be built.
pub struct BatchBuilder {
    /// Encoded primary key of the batch.
    primary_key: Vec<u8>,
    /// Timestamp column, `None` until it is provided.
    timestamps: Option<VectorRef>,
    /// Sequence column, `None` until it is provided.
    sequences: Option<Arc<UInt64Vector>>,
    /// Op type column, `None` until it is provided.
    op_types: Option<Arc<UInt8Vector>>,
    /// Field columns in insertion order.
    fields: Vec<BatchColumn>,
}
911
912impl BatchBuilder {
913    /// Creates a new [BatchBuilder] with primary key.
914    pub fn new(primary_key: Vec<u8>) -> BatchBuilder {
915        BatchBuilder {
916            primary_key,
917            timestamps: None,
918            sequences: None,
919            op_types: None,
920            fields: Vec::new(),
921        }
922    }
923
924    /// Creates a new [BatchBuilder] with all required columns.
925    pub fn with_required_columns(
926        primary_key: Vec<u8>,
927        timestamps: VectorRef,
928        sequences: Arc<UInt64Vector>,
929        op_types: Arc<UInt8Vector>,
930    ) -> BatchBuilder {
931        BatchBuilder {
932            primary_key,
933            timestamps: Some(timestamps),
934            sequences: Some(sequences),
935            op_types: Some(op_types),
936            fields: Vec::new(),
937        }
938    }
939
940    /// Set all field columns.
941    pub fn with_fields(mut self, fields: Vec<BatchColumn>) -> Self {
942        self.fields = fields;
943        self
944    }
945
946    /// Push a field column.
947    pub fn push_field(&mut self, column: BatchColumn) -> &mut Self {
948        self.fields.push(column);
949        self
950    }
951
952    /// Push an array as a field.
953    pub fn push_field_array(&mut self, column_id: ColumnId, array: ArrayRef) -> Result<&mut Self> {
954        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
955        self.fields.push(BatchColumn {
956            column_id,
957            data: vector,
958        });
959
960        Ok(self)
961    }
962
963    /// Try to set an array as timestamps.
964    pub fn timestamps_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
965        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
966        ensure!(
967            vector.data_type().is_timestamp(),
968            InvalidBatchSnafu {
969                reason: format!("{:?} is not a timestamp type", vector.data_type()),
970            }
971        );
972
973        self.timestamps = Some(vector);
974        Ok(self)
975    }
976
977    /// Try to set an array as sequences.
978    pub fn sequences_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
979        ensure!(
980            *array.data_type() == arrow::datatypes::DataType::UInt64,
981            InvalidBatchSnafu {
982                reason: "sequence array is not UInt64 type",
983            }
984        );
985        // Safety: The cast must success as we have ensured it is uint64 type.
986        let vector = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
987        self.sequences = Some(vector);
988
989        Ok(self)
990    }
991
992    /// Try to set an array as op types.
993    pub fn op_types_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
994        ensure!(
995            *array.data_type() == arrow::datatypes::DataType::UInt8,
996            InvalidBatchSnafu {
997                reason: "sequence array is not UInt8 type",
998            }
999        );
1000        // Safety: The cast must success as we have ensured it is uint64 type.
1001        let vector = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
1002        self.op_types = Some(vector);
1003
1004        Ok(self)
1005    }
1006
1007    /// Builds the [Batch].
1008    pub fn build(self) -> Result<Batch> {
1009        let timestamps = self.timestamps.context(InvalidBatchSnafu {
1010            reason: "missing timestamps",
1011        })?;
1012        let sequences = self.sequences.context(InvalidBatchSnafu {
1013            reason: "missing sequences",
1014        })?;
1015        let op_types = self.op_types.context(InvalidBatchSnafu {
1016            reason: "missing op_types",
1017        })?;
1018        // Our storage format ensure these columns are not nullable so
1019        // we use assert here.
1020        assert_eq!(0, timestamps.null_count());
1021        assert_eq!(0, sequences.null_count());
1022        assert_eq!(0, op_types.null_count());
1023
1024        let ts_len = timestamps.len();
1025        ensure!(
1026            sequences.len() == ts_len,
1027            InvalidBatchSnafu {
1028                reason: format!(
1029                    "sequence have different len {} != {}",
1030                    sequences.len(),
1031                    ts_len
1032                ),
1033            }
1034        );
1035        ensure!(
1036            op_types.len() == ts_len,
1037            InvalidBatchSnafu {
1038                reason: format!(
1039                    "op type have different len {} != {}",
1040                    op_types.len(),
1041                    ts_len
1042                ),
1043            }
1044        );
1045        for column in &self.fields {
1046            ensure!(
1047                column.data.len() == ts_len,
1048                InvalidBatchSnafu {
1049                    reason: format!(
1050                        "column {} has different len {} != {}",
1051                        column.column_id,
1052                        column.data.len(),
1053                        ts_len
1054                    ),
1055                }
1056            );
1057        }
1058
1059        Ok(Batch {
1060            primary_key: self.primary_key,
1061            pk_values: None,
1062            timestamps,
1063            sequences,
1064            op_types,
1065            fields: self.fields,
1066            fields_idx: None,
1067        })
1068    }
1069}
1070
1071impl From<Batch> for BatchBuilder {
1072    fn from(batch: Batch) -> Self {
1073        Self {
1074            primary_key: batch.primary_key,
1075            timestamps: Some(batch.timestamps),
1076            sequences: Some(batch.sequences),
1077            op_types: Some(batch.op_types),
1078            fields: batch.fields,
1079        }
1080    }
1081}
1082
/// Async [Batch] reader and iterator wrapper.
///
/// This is the data source for SST writers or internal readers. It unifies
/// readers, iterators and streams behind a single `next_batch()` method.
pub enum Source {
    /// Source from a [BoxedBatchReader].
    Reader(BoxedBatchReader),
    /// Source from a [BoxedBatchIterator].
    Iter(BoxedBatchIterator),
    /// Source from a [BoxedBatchStream].
    Stream(BoxedBatchStream),
}
1094
1095impl Source {
1096    /// Returns next [Batch] from this data source.
1097    pub async fn next_batch(&mut self) -> Result<Option<Batch>> {
1098        match self {
1099            Source::Reader(reader) => reader.next_batch().await,
1100            Source::Iter(iter) => iter.next().transpose(),
1101            Source::Stream(stream) => stream.try_next().await,
1102        }
1103    }
1104}
1105
/// Async [RecordBatch] reader and iterator wrapper for flat format.
///
/// It unifies iterators and streams behind a single `next_batch()` method.
pub enum FlatSource {
    /// Source from a [BoxedRecordBatchIterator].
    Iter(BoxedRecordBatchIterator),
    /// Source from a [BoxedRecordBatchStream].
    Stream(BoxedRecordBatchStream),
}
1113
1114impl FlatSource {
1115    /// Returns next [RecordBatch] from this data source.
1116    pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
1117        match self {
1118            FlatSource::Iter(iter) => iter.next().transpose(),
1119            FlatSource::Stream(stream) => stream.try_next().await,
1120        }
1121    }
1122}
1123
/// Async batch reader.
///
/// The reader must guarantee [Batch]es returned by it have the same schema.
#[async_trait]
pub trait BatchReader: Send {
    /// Fetch next [Batch].
    ///
    /// Returns `Ok(None)` when the reader has reached its end. Calling `next_batch()`
    /// again after that won't return any more batches.
    ///
    /// If `Err` is returned, the caller should not call this method again; the
    /// implementor may or may not panic in such a case.
    async fn next_batch(&mut self) -> Result<Option<Batch>>;
}
1138
/// Pointer to [BatchReader], i.e. a boxed, dynamically dispatched reader.
pub type BoxedBatchReader = Box<dyn BatchReader>;

/// Pointer to a stream that yields [Batch].
///
/// Each item of the stream is a `Result<Batch>`.
pub type BoxedBatchStream = BoxStream<'static, Result<Batch>>;

/// Pointer to a stream that yields [RecordBatch].
///
/// Each item of the stream is a `Result<RecordBatch>`.
pub type BoxedRecordBatchStream = BoxStream<'static, Result<RecordBatch>>;
1147
1148#[async_trait::async_trait]
1149impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
1150    async fn next_batch(&mut self) -> Result<Option<Batch>> {
1151        (**self).next_batch().await
1152    }
1153}
1154
/// Local metrics for scanners.
///
/// All metrics start from zero when created via `Default`.
#[derive(Debug, Default)]
pub(crate) struct ScannerMetrics {
    /// Duration to scan data.
    scan_cost: Duration,
    /// Duration while waiting for `yield`.
    yield_cost: Duration,
    /// Number of batches returned.
    num_batches: usize,
    /// Number of rows returned.
    num_rows: usize,
}
1167
1168#[cfg(test)]
1169mod tests {
1170    use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
1171    use mito_codec::row_converter::{self, build_primary_key_codec_with_fields};
1172    use store_api::codec::PrimaryKeyEncoding;
1173    use store_api::storage::consts::ReservedColumnId;
1174
1175    use super::*;
1176    use crate::error::Error;
1177    use crate::test_util::new_batch_builder;
1178
1179    fn new_batch(
1180        timestamps: &[i64],
1181        sequences: &[u64],
1182        op_types: &[OpType],
1183        field: &[u64],
1184    ) -> Batch {
1185        new_batch_builder(b"test", timestamps, sequences, op_types, 1, field)
1186            .build()
1187            .unwrap()
1188    }
1189
1190    fn new_batch_with_u64_fields(
1191        timestamps: &[i64],
1192        sequences: &[u64],
1193        op_types: &[OpType],
1194        fields: &[(ColumnId, &[Option<u64>])],
1195    ) -> Batch {
1196        assert_eq!(timestamps.len(), sequences.len());
1197        assert_eq!(timestamps.len(), op_types.len());
1198        for (_, values) in fields {
1199            assert_eq!(timestamps.len(), values.len());
1200        }
1201
1202        let mut builder = BatchBuilder::new(b"test".to_vec());
1203        builder
1204            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1205                timestamps.iter().copied(),
1206            )))
1207            .unwrap()
1208            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1209                sequences.iter().copied(),
1210            )))
1211            .unwrap()
1212            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1213                op_types.iter().map(|v| *v as u8),
1214            )))
1215            .unwrap();
1216
1217        for (col_id, values) in fields {
1218            builder
1219                .push_field_array(*col_id, Arc::new(UInt64Array::from(values.to_vec())))
1220                .unwrap();
1221        }
1222
1223        builder.build().unwrap()
1224    }
1225
1226    fn new_batch_without_fields(
1227        timestamps: &[i64],
1228        sequences: &[u64],
1229        op_types: &[OpType],
1230    ) -> Batch {
1231        assert_eq!(timestamps.len(), sequences.len());
1232        assert_eq!(timestamps.len(), op_types.len());
1233
1234        let mut builder = BatchBuilder::new(b"test".to_vec());
1235        builder
1236            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1237                timestamps.iter().copied(),
1238            )))
1239            .unwrap()
1240            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1241                sequences.iter().copied(),
1242            )))
1243            .unwrap()
1244            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1245                op_types.iter().map(|v| *v as u8),
1246            )))
1247            .unwrap();
1248
1249        builder.build().unwrap()
1250    }
1251
1252    #[test]
1253    fn test_empty_batch() {
1254        let batch = Batch::empty();
1255        assert!(batch.is_empty());
1256        assert_eq!(None, batch.first_timestamp());
1257        assert_eq!(None, batch.last_timestamp());
1258        assert_eq!(None, batch.first_sequence());
1259        assert_eq!(None, batch.last_sequence());
1260        assert!(batch.timestamps_native().is_none());
1261    }
1262
1263    #[test]
1264    fn test_first_last_one() {
1265        let batch = new_batch(&[1], &[2], &[OpType::Put], &[4]);
1266        assert_eq!(
1267            Timestamp::new_millisecond(1),
1268            batch.first_timestamp().unwrap()
1269        );
1270        assert_eq!(
1271            Timestamp::new_millisecond(1),
1272            batch.last_timestamp().unwrap()
1273        );
1274        assert_eq!(2, batch.first_sequence().unwrap());
1275        assert_eq!(2, batch.last_sequence().unwrap());
1276    }
1277
1278    #[test]
1279    fn test_first_last_multiple() {
1280        let batch = new_batch(
1281            &[1, 2, 3],
1282            &[11, 12, 13],
1283            &[OpType::Put, OpType::Put, OpType::Put],
1284            &[21, 22, 23],
1285        );
1286        assert_eq!(
1287            Timestamp::new_millisecond(1),
1288            batch.first_timestamp().unwrap()
1289        );
1290        assert_eq!(
1291            Timestamp::new_millisecond(3),
1292            batch.last_timestamp().unwrap()
1293        );
1294        assert_eq!(11, batch.first_sequence().unwrap());
1295        assert_eq!(13, batch.last_sequence().unwrap());
1296    }
1297
1298    #[test]
1299    fn test_slice() {
1300        let batch = new_batch(
1301            &[1, 2, 3, 4],
1302            &[11, 12, 13, 14],
1303            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1304            &[21, 22, 23, 24],
1305        );
1306        let batch = batch.slice(1, 2);
1307        let expect = new_batch(
1308            &[2, 3],
1309            &[12, 13],
1310            &[OpType::Delete, OpType::Put],
1311            &[22, 23],
1312        );
1313        assert_eq!(expect, batch);
1314    }
1315
1316    #[test]
1317    fn test_timestamps_native() {
1318        let batch = new_batch(
1319            &[1, 2, 3, 4],
1320            &[11, 12, 13, 14],
1321            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1322            &[21, 22, 23, 24],
1323        );
1324        assert_eq!(&[1, 2, 3, 4], batch.timestamps_native().unwrap());
1325    }
1326
1327    #[test]
1328    fn test_concat_empty() {
1329        let err = Batch::concat(vec![]).unwrap_err();
1330        assert!(
1331            matches!(err, Error::InvalidBatch { .. }),
1332            "unexpected err: {err}"
1333        );
1334    }
1335
1336    #[test]
1337    fn test_concat_one() {
1338        let batch = new_batch(&[], &[], &[], &[]);
1339        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1340        assert_eq!(batch, actual);
1341
1342        let batch = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
1343        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1344        assert_eq!(batch, actual);
1345    }
1346
1347    #[test]
1348    fn test_concat_multiple() {
1349        let batches = vec![
1350            new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]),
1351            new_batch(
1352                &[3, 4, 5],
1353                &[13, 14, 15],
1354                &[OpType::Put, OpType::Delete, OpType::Put],
1355                &[23, 24, 25],
1356            ),
1357            new_batch(&[], &[], &[], &[]),
1358            new_batch(&[6], &[16], &[OpType::Put], &[26]),
1359        ];
1360        let batch = Batch::concat(batches).unwrap();
1361        let expect = new_batch(
1362            &[1, 2, 3, 4, 5, 6],
1363            &[11, 12, 13, 14, 15, 16],
1364            &[
1365                OpType::Put,
1366                OpType::Put,
1367                OpType::Put,
1368                OpType::Delete,
1369                OpType::Put,
1370                OpType::Put,
1371            ],
1372            &[21, 22, 23, 24, 25, 26],
1373        );
1374        assert_eq!(expect, batch);
1375    }
1376
1377    #[test]
1378    fn test_concat_different() {
1379        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1380        let mut batch2 = new_batch(&[2], &[2], &[OpType::Put], &[2]);
1381        batch2.primary_key = b"hello".to_vec();
1382        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1383        assert!(
1384            matches!(err, Error::InvalidBatch { .. }),
1385            "unexpected err: {err}"
1386        );
1387    }
1388
1389    #[test]
1390    fn test_concat_different_fields() {
1391        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1392        let fields = vec![
1393            batch1.fields()[0].clone(),
1394            BatchColumn {
1395                column_id: 2,
1396                data: Arc::new(UInt64Vector::from_slice([2])),
1397            },
1398        ];
1399        // Batch 2 has more fields.
1400        let batch2 = batch1.clone().with_fields(fields).unwrap();
1401        let err = Batch::concat(vec![batch1.clone(), batch2]).unwrap_err();
1402        assert!(
1403            matches!(err, Error::InvalidBatch { .. }),
1404            "unexpected err: {err}"
1405        );
1406
1407        // Batch 2 has different field.
1408        let fields = vec![BatchColumn {
1409            column_id: 2,
1410            data: Arc::new(UInt64Vector::from_slice([2])),
1411        }];
1412        let batch2 = batch1.clone().with_fields(fields).unwrap();
1413        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1414        assert!(
1415            matches!(err, Error::InvalidBatch { .. }),
1416            "unexpected err: {err}"
1417        );
1418    }
1419
1420    #[test]
1421    fn test_filter_deleted_empty() {
1422        let mut batch = new_batch(&[], &[], &[], &[]);
1423        batch.filter_deleted().unwrap();
1424        assert!(batch.is_empty());
1425    }
1426
1427    #[test]
1428    fn test_filter_deleted() {
1429        let mut batch = new_batch(
1430            &[1, 2, 3, 4],
1431            &[11, 12, 13, 14],
1432            &[OpType::Delete, OpType::Put, OpType::Delete, OpType::Put],
1433            &[21, 22, 23, 24],
1434        );
1435        batch.filter_deleted().unwrap();
1436        let expect = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
1437        assert_eq!(expect, batch);
1438
1439        let mut batch = new_batch(
1440            &[1, 2, 3, 4],
1441            &[11, 12, 13, 14],
1442            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1443            &[21, 22, 23, 24],
1444        );
1445        let expect = batch.clone();
1446        batch.filter_deleted().unwrap();
1447        assert_eq!(expect, batch);
1448    }
1449
1450    #[test]
1451    fn test_filter_by_sequence() {
1452        // Filters put only.
1453        let mut batch = new_batch(
1454            &[1, 2, 3, 4],
1455            &[11, 12, 13, 14],
1456            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1457            &[21, 22, 23, 24],
1458        );
1459        batch
1460            .filter_by_sequence(Some(SequenceRange::LtEq { max: 13 }))
1461            .unwrap();
1462        let expect = new_batch(
1463            &[1, 2, 3],
1464            &[11, 12, 13],
1465            &[OpType::Put, OpType::Put, OpType::Put],
1466            &[21, 22, 23],
1467        );
1468        assert_eq!(expect, batch);
1469
1470        // Filters to empty.
1471        let mut batch = new_batch(
1472            &[1, 2, 3, 4],
1473            &[11, 12, 13, 14],
1474            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1475            &[21, 22, 23, 24],
1476        );
1477
1478        batch
1479            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
1480            .unwrap();
1481        assert!(batch.is_empty());
1482
1483        // None filter.
1484        let mut batch = new_batch(
1485            &[1, 2, 3, 4],
1486            &[11, 12, 13, 14],
1487            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1488            &[21, 22, 23, 24],
1489        );
1490        let expect = batch.clone();
1491        batch.filter_by_sequence(None).unwrap();
1492        assert_eq!(expect, batch);
1493
1494        // Filter a empty batch
1495        let mut batch = new_batch(&[], &[], &[], &[]);
1496        batch
1497            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
1498            .unwrap();
1499        assert!(batch.is_empty());
1500
1501        // Filter a empty batch with None
1502        let mut batch = new_batch(&[], &[], &[], &[]);
1503        batch.filter_by_sequence(None).unwrap();
1504        assert!(batch.is_empty());
1505
1506        // Test From variant - exclusive lower bound
1507        let mut batch = new_batch(
1508            &[1, 2, 3, 4],
1509            &[11, 12, 13, 14],
1510            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1511            &[21, 22, 23, 24],
1512        );
1513        batch
1514            .filter_by_sequence(Some(SequenceRange::Gt { min: 12 }))
1515            .unwrap();
1516        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1517        assert_eq!(expect, batch);
1518
1519        // Test From variant with no matches
1520        let mut batch = new_batch(
1521            &[1, 2, 3, 4],
1522            &[11, 12, 13, 14],
1523            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1524            &[21, 22, 23, 24],
1525        );
1526        batch
1527            .filter_by_sequence(Some(SequenceRange::Gt { min: 20 }))
1528            .unwrap();
1529        assert!(batch.is_empty());
1530
1531        // Test Range variant - exclusive lower bound, inclusive upper bound
1532        let mut batch = new_batch(
1533            &[1, 2, 3, 4, 5],
1534            &[11, 12, 13, 14, 15],
1535            &[
1536                OpType::Put,
1537                OpType::Put,
1538                OpType::Put,
1539                OpType::Put,
1540                OpType::Put,
1541            ],
1542            &[21, 22, 23, 24, 25],
1543        );
1544        batch
1545            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 12, max: 14 }))
1546            .unwrap();
1547        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1548        assert_eq!(expect, batch);
1549
1550        // Test Range variant with mixed operations
1551        let mut batch = new_batch(
1552            &[1, 2, 3, 4, 5],
1553            &[11, 12, 13, 14, 15],
1554            &[
1555                OpType::Put,
1556                OpType::Delete,
1557                OpType::Put,
1558                OpType::Delete,
1559                OpType::Put,
1560            ],
1561            &[21, 22, 23, 24, 25],
1562        );
1563        batch
1564            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 11, max: 13 }))
1565            .unwrap();
1566        let expect = new_batch(
1567            &[2, 3],
1568            &[12, 13],
1569            &[OpType::Delete, OpType::Put],
1570            &[22, 23],
1571        );
1572        assert_eq!(expect, batch);
1573
1574        // Test Range variant with no matches
1575        let mut batch = new_batch(
1576            &[1, 2, 3, 4],
1577            &[11, 12, 13, 14],
1578            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1579            &[21, 22, 23, 24],
1580        );
1581        batch
1582            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 20, max: 25 }))
1583            .unwrap();
1584        assert!(batch.is_empty());
1585    }
1586
1587    #[test]
1588    fn test_merge_last_non_null_no_dup() {
1589        let mut batch = new_batch_with_u64_fields(
1590            &[1, 2],
1591            &[2, 1],
1592            &[OpType::Put, OpType::Put],
1593            &[(1, &[Some(10), None]), (2, &[Some(100), Some(200)])],
1594        );
1595        let expect = batch.clone();
1596        batch.merge_last_non_null().unwrap();
1597        assert_eq!(expect, batch);
1598    }
1599
1600    #[test]
1601    fn test_merge_last_non_null_fill_null_fields() {
1602        // Rows are already sorted by timestamp asc and sequence desc.
1603        let mut batch = new_batch_with_u64_fields(
1604            &[1, 1, 1],
1605            &[3, 2, 1],
1606            &[OpType::Put, OpType::Put, OpType::Put],
1607            &[
1608                (1, &[None, Some(10), Some(11)]),
1609                (2, &[Some(100), Some(200), Some(300)]),
1610            ],
1611        );
1612        batch.merge_last_non_null().unwrap();
1613
1614        // Field 1 is filled from the first older row (seq=2). Field 2 keeps the base value.
1615        // Filled fields must not be overwritten by even older duplicates.
1616        let expect = new_batch_with_u64_fields(
1617            &[1],
1618            &[3],
1619            &[OpType::Put],
1620            &[(1, &[Some(10)]), (2, &[Some(100)])],
1621        );
1622        assert_eq!(expect, batch);
1623    }
1624
1625    #[test]
1626    fn test_merge_last_non_null_stop_at_delete_row() {
1627        // A delete row in older duplicates should stop filling to avoid resurrecting values before
1628        // deletion.
1629        let mut batch = new_batch_with_u64_fields(
1630            &[1, 1, 1],
1631            &[3, 2, 1],
1632            &[OpType::Put, OpType::Delete, OpType::Put],
1633            &[
1634                (1, &[None, Some(10), Some(11)]),
1635                (2, &[Some(100), Some(200), Some(300)]),
1636            ],
1637        );
1638        batch.merge_last_non_null().unwrap();
1639
1640        let expect = new_batch_with_u64_fields(
1641            &[1],
1642            &[3],
1643            &[OpType::Put],
1644            &[(1, &[None]), (2, &[Some(100)])],
1645        );
1646        assert_eq!(expect, batch);
1647    }
1648
1649    #[test]
1650    fn test_merge_last_non_null_base_delete_no_merge() {
1651        let mut batch = new_batch_with_u64_fields(
1652            &[1, 1],
1653            &[3, 2],
1654            &[OpType::Delete, OpType::Put],
1655            &[(1, &[None, Some(10)]), (2, &[None, Some(200)])],
1656        );
1657        batch.merge_last_non_null().unwrap();
1658
1659        // Base row is delete, keep it as is and don't merge fields from older rows.
1660        let expect =
1661            new_batch_with_u64_fields(&[1], &[3], &[OpType::Delete], &[(1, &[None]), (2, &[None])]);
1662        assert_eq!(expect, batch);
1663    }
1664
1665    #[test]
1666    fn test_merge_last_non_null_multiple_timestamp_groups() {
1667        let mut batch = new_batch_with_u64_fields(
1668            &[1, 1, 2, 3, 3],
1669            &[5, 4, 3, 2, 1],
1670            &[
1671                OpType::Put,
1672                OpType::Put,
1673                OpType::Put,
1674                OpType::Put,
1675                OpType::Put,
1676            ],
1677            &[
1678                (1, &[None, Some(10), Some(20), None, Some(30)]),
1679                (2, &[Some(100), Some(110), Some(120), None, Some(130)]),
1680            ],
1681        );
1682        batch.merge_last_non_null().unwrap();
1683
1684        let expect = new_batch_with_u64_fields(
1685            &[1, 2, 3],
1686            &[5, 3, 2],
1687            &[OpType::Put, OpType::Put, OpType::Put],
1688            &[
1689                (1, &[Some(10), Some(20), Some(30)]),
1690                (2, &[Some(100), Some(120), Some(130)]),
1691            ],
1692        );
1693        assert_eq!(expect, batch);
1694    }
1695
1696    #[test]
1697    fn test_merge_last_non_null_no_fields() {
1698        let mut batch = new_batch_without_fields(
1699            &[1, 1, 2],
1700            &[3, 2, 1],
1701            &[OpType::Put, OpType::Put, OpType::Put],
1702        );
1703        batch.merge_last_non_null().unwrap();
1704
1705        let expect = new_batch_without_fields(&[1, 2], &[3, 1], &[OpType::Put, OpType::Put]);
1706        assert_eq!(expect, batch);
1707    }
1708
1709    #[test]
1710    fn test_filter() {
1711        // Filters put only.
1712        let mut batch = new_batch(
1713            &[1, 2, 3, 4],
1714            &[11, 12, 13, 14],
1715            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1716            &[21, 22, 23, 24],
1717        );
1718        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1719        batch.filter(&predicate).unwrap();
1720        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1721        assert_eq!(expect, batch);
1722
1723        // Filters deletion.
1724        let mut batch = new_batch(
1725            &[1, 2, 3, 4],
1726            &[11, 12, 13, 14],
1727            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1728            &[21, 22, 23, 24],
1729        );
1730        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1731        batch.filter(&predicate).unwrap();
1732        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1733        assert_eq!(expect, batch);
1734
1735        // Filters to empty.
1736        let predicate = BooleanVector::from_vec(vec![false, false]);
1737        batch.filter(&predicate).unwrap();
1738        assert!(batch.is_empty());
1739    }
1740
1741    #[test]
1742    fn test_sort_and_dedup() {
1743        let original = new_batch(
1744            &[2, 3, 1, 4, 5, 2],
1745            &[1, 2, 3, 4, 5, 6],
1746            &[
1747                OpType::Put,
1748                OpType::Put,
1749                OpType::Put,
1750                OpType::Put,
1751                OpType::Put,
1752                OpType::Put,
1753            ],
1754            &[21, 22, 23, 24, 25, 26],
1755        );
1756
1757        let mut batch = original.clone();
1758        batch.sort(true).unwrap();
1759        // It should only keep one timestamp 2.
1760        assert_eq!(
1761            new_batch(
1762                &[1, 2, 3, 4, 5],
1763                &[3, 6, 2, 4, 5],
1764                &[
1765                    OpType::Put,
1766                    OpType::Put,
1767                    OpType::Put,
1768                    OpType::Put,
1769                    OpType::Put,
1770                ],
1771                &[23, 26, 22, 24, 25],
1772            ),
1773            batch
1774        );
1775
1776        let mut batch = original.clone();
1777        batch.sort(false).unwrap();
1778
1779        // It should only keep one timestamp 2.
1780        assert_eq!(
1781            new_batch(
1782                &[1, 2, 2, 3, 4, 5],
1783                &[3, 6, 1, 2, 4, 5],
1784                &[
1785                    OpType::Put,
1786                    OpType::Put,
1787                    OpType::Put,
1788                    OpType::Put,
1789                    OpType::Put,
1790                    OpType::Put,
1791                ],
1792                &[23, 26, 21, 22, 24, 25],
1793            ),
1794            batch
1795        );
1796
1797        let original = new_batch(
1798            &[2, 2, 1],
1799            &[1, 6, 1],
1800            &[OpType::Delete, OpType::Put, OpType::Put],
1801            &[21, 22, 23],
1802        );
1803
1804        let mut batch = original.clone();
1805        batch.sort(true).unwrap();
1806        let expect = new_batch(&[1, 2], &[1, 6], &[OpType::Put, OpType::Put], &[23, 22]);
1807        assert_eq!(expect, batch);
1808
1809        let mut batch = original.clone();
1810        batch.sort(false).unwrap();
1811        let expect = new_batch(
1812            &[1, 2, 2],
1813            &[1, 6, 1],
1814            &[OpType::Put, OpType::Put, OpType::Delete],
1815            &[23, 22, 21],
1816        );
1817        assert_eq!(expect, batch);
1818    }
1819
1820    #[test]
1821    fn test_get_value() {
1822        let encodings = [PrimaryKeyEncoding::Dense, PrimaryKeyEncoding::Sparse];
1823
1824        for encoding in encodings {
1825            let codec = build_primary_key_codec_with_fields(
1826                encoding,
1827                [
1828                    (
1829                        ReservedColumnId::table_id(),
1830                        row_converter::SortField::new(ConcreteDataType::uint32_datatype()),
1831                    ),
1832                    (
1833                        ReservedColumnId::tsid(),
1834                        row_converter::SortField::new(ConcreteDataType::uint64_datatype()),
1835                    ),
1836                    (
1837                        100,
1838                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1839                    ),
1840                    (
1841                        200,
1842                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1843                    ),
1844                ]
1845                .into_iter(),
1846            );
1847
1848            let values = [
1849                Value::UInt32(1000),
1850                Value::UInt64(2000),
1851                Value::String("abcdefgh".into()),
1852                Value::String("zyxwvu".into()),
1853            ];
1854            let mut buf = vec![];
1855            codec
1856                .encode_values(
1857                    &[
1858                        (ReservedColumnId::table_id(), values[0].clone()),
1859                        (ReservedColumnId::tsid(), values[1].clone()),
1860                        (100, values[2].clone()),
1861                        (200, values[3].clone()),
1862                    ],
1863                    &mut buf,
1864                )
1865                .unwrap();
1866
1867            let field_col_id = 2;
1868            let mut batch = new_batch_builder(
1869                &buf,
1870                &[1, 2, 3],
1871                &[1, 1, 1],
1872                &[OpType::Put, OpType::Put, OpType::Put],
1873                field_col_id,
1874                &[42, 43, 44],
1875            )
1876            .build()
1877            .unwrap();
1878
1879            let v = batch
1880                .pk_col_value(&*codec, 0, ReservedColumnId::table_id())
1881                .unwrap()
1882                .unwrap();
1883            assert_eq!(values[0], *v);
1884
1885            let v = batch
1886                .pk_col_value(&*codec, 1, ReservedColumnId::tsid())
1887                .unwrap()
1888                .unwrap();
1889            assert_eq!(values[1], *v);
1890
1891            let v = batch.pk_col_value(&*codec, 2, 100).unwrap().unwrap();
1892            assert_eq!(values[2], *v);
1893
1894            let v = batch.pk_col_value(&*codec, 3, 200).unwrap().unwrap();
1895            assert_eq!(values[3], *v);
1896
1897            let v = batch.field_col_value(field_col_id).unwrap();
1898            assert_eq!(v.data.get(0), Value::UInt64(42));
1899            assert_eq!(v.data.get(1), Value::UInt64(43));
1900            assert_eq!(v.data.get(2), Value::UInt64(44));
1901        }
1902    }
1903}