Skip to main content

mito2/
read.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Common structs and utilities for reading data.
16
17pub mod batch_adapter;
18pub mod compat;
19pub mod dedup;
20pub mod flat_dedup;
21pub mod flat_merge;
22pub mod flat_projection;
23pub mod last_row;
24pub mod merge;
25pub mod plain_batch;
26pub mod projection;
27pub(crate) mod prune;
28pub(crate) mod pruner;
29pub mod range;
30#[cfg(feature = "test")]
31pub mod range_cache;
32#[cfg(not(feature = "test"))]
33pub(crate) mod range_cache;
34pub mod scan_region;
35pub mod scan_util;
36pub(crate) mod seq_scan;
37pub mod series_scan;
38pub mod stream;
39pub(crate) mod unordered_scan;
40
41use std::collections::{HashMap, HashSet};
42use std::sync::Arc;
43use std::time::Duration;
44
45use api::v1::OpType;
46use async_trait::async_trait;
47use common_time::Timestamp;
48use datafusion_common::arrow::array::UInt8Array;
49use datatypes::arrow;
50use datatypes::arrow::array::{Array, ArrayRef};
51use datatypes::arrow::compute::SortOptions;
52use datatypes::arrow::record_batch::RecordBatch;
53use datatypes::arrow::row::{RowConverter, SortField};
54use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
55use datatypes::scalars::ScalarVectorBuilder;
56use datatypes::types::TimestampType;
57use datatypes::value::{Value, ValueRef};
58use datatypes::vectors::{
59    BooleanVector, Helper, TimestampMicrosecondVector, TimestampMillisecondVector,
60    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampSecondVector,
61    UInt8Vector, UInt8VectorBuilder, UInt32Vector, UInt64Vector, UInt64VectorBuilder, Vector,
62    VectorRef,
63};
64use futures::TryStreamExt;
65use futures::stream::BoxStream;
66use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
67use snafu::{OptionExt, ResultExt, ensure};
68use store_api::metadata::RegionMetadata;
69use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
70
71use crate::error::{
72    ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu,
73    Result,
74};
75use crate::memtable::{BoxedBatchIterator, BoxedRecordBatchIterator};
76use crate::read::prune::PruneReader;
77
/// Storage internal representation of a batch of rows for a primary key (time series).
///
/// Rows are sorted by primary key, timestamp, sequence desc, op_type desc. Fields
/// always keep the same relative order as fields in [RegionMetadata](store_api::metadata::RegionMetadata).
///
/// The `timestamps`, `sequences`, `op_types` and field columns are expected to
/// have the same length; [`Batch::num_rows`] relies on that.
#[derive(Debug, PartialEq, Clone)]
pub struct Batch {
    /// Primary key encoded in a comparable form.
    primary_key: Vec<u8>,
    /// Possibly decoded `primary_key` values. Some places would decode it in advance.
    ///
    /// `None` until it is set explicitly or decoded lazily by [`Batch::pk_col_value`].
    pk_values: Option<CompositeValues>,
    /// Timestamps of rows, should be sorted and not null.
    timestamps: VectorRef,
    /// Sequences of rows
    ///
    /// UInt64 type, not null.
    sequences: Arc<UInt64Vector>,
    /// Op types of rows
    ///
    /// UInt8 type, not null.
    op_types: Arc<UInt8Vector>,
    /// Fields organized in columnar format.
    fields: Vec<BatchColumn>,
    /// Cache for field index lookup.
    ///
    /// Maps a column id to its position in `fields`. It is `None` until
    /// [`Batch::field_col_value`] builds it on first use.
    fields_idx: Option<HashMap<ColumnId, usize>>,
}
103
104impl Batch {
    /// Creates a new batch.
    ///
    /// The batch is assembled through [`BatchBuilder`] from the required columns
    /// (`primary_key`, `timestamps`, `sequences`, `op_types`) plus the given `fields`.
    ///
    /// # Errors
    /// Propagates any error returned by the builder's `build()`.
    pub fn new(
        primary_key: Vec<u8>,
        timestamps: VectorRef,
        sequences: Arc<UInt64Vector>,
        op_types: Arc<UInt8Vector>,
        fields: Vec<BatchColumn>,
    ) -> Result<Batch> {
        BatchBuilder::with_required_columns(primary_key, timestamps, sequences, op_types)
            .with_fields(fields)
            .build()
    }
117
118    /// Tries to set fields for the batch.
119    pub fn with_fields(self, fields: Vec<BatchColumn>) -> Result<Batch> {
120        Batch::new(
121            self.primary_key,
122            self.timestamps,
123            self.sequences,
124            self.op_types,
125            fields,
126        )
127    }
128
129    /// Returns primary key of the batch.
130    pub fn primary_key(&self) -> &[u8] {
131        &self.primary_key
132    }
133
    /// Returns possibly decoded primary-key values.
    ///
    /// Returns `None` when the primary key has not been decoded (or set) yet.
    pub fn pk_values(&self) -> Option<&CompositeValues> {
        self.pk_values.as_ref()
    }
138
    /// Sets possibly decoded primary-key values.
    ///
    /// Any previously stored values are replaced. The encoded `primary_key` is
    /// not touched, so the caller is responsible for keeping both consistent.
    pub fn set_pk_values(&mut self, pk_values: CompositeValues) {
        self.pk_values = Some(pk_values);
    }
143
    /// Removes possibly decoded primary-key values. For testing only.
    ///
    /// Only compiled for tests or when the `test` feature is enabled.
    #[cfg(any(test, feature = "test"))]
    pub fn remove_pk_values(&mut self) {
        self.pk_values = None;
    }
149
150    /// Returns fields in the batch.
151    pub fn fields(&self) -> &[BatchColumn] {
152        &self.fields
153    }
154
    /// Returns timestamps of the batch.
    ///
    /// The vector is type-erased; use [`Batch::timestamps_native`] to access the
    /// raw `i64` values.
    pub fn timestamps(&self) -> &VectorRef {
        &self.timestamps
    }
159
    /// Returns sequences of the batch (one `u64` sequence per row).
    pub fn sequences(&self) -> &Arc<UInt64Vector> {
        &self.sequences
    }
164
    /// Returns op types of the batch (one `u8` op type per row).
    pub fn op_types(&self) -> &Arc<UInt8Vector> {
        &self.op_types
    }
169
    /// Returns the number of rows in the batch.
    ///
    /// The count is taken from the `sequences` column.
    pub fn num_rows(&self) -> usize {
        // All vectors have the same length. We use the length of sequences vector
        // since it has static type.
        self.sequences.len()
    }
176
177    /// Create an empty [`Batch`].
178    pub(crate) fn empty() -> Self {
179        Self {
180            primary_key: vec![],
181            pk_values: None,
182            timestamps: Arc::new(TimestampMillisecondVectorBuilder::with_capacity(0).finish()),
183            sequences: Arc::new(UInt64VectorBuilder::with_capacity(0).finish()),
184            op_types: Arc::new(UInt8VectorBuilder::with_capacity(0).finish()),
185            fields: vec![],
186            fields_idx: None,
187        }
188    }
189
    /// Returns true if the number of rows in the batch is 0.
    ///
    /// See [`Batch::num_rows`] for how rows are counted.
    pub fn is_empty(&self) -> bool {
        self.num_rows() == 0
    }
194
195    /// Returns the first timestamp in the batch or `None` if the batch is empty.
196    pub fn first_timestamp(&self) -> Option<Timestamp> {
197        if self.timestamps.is_empty() {
198            return None;
199        }
200
201        Some(self.get_timestamp(0))
202    }
203
204    /// Returns the last timestamp in the batch or `None` if the batch is empty.
205    pub fn last_timestamp(&self) -> Option<Timestamp> {
206        if self.timestamps.is_empty() {
207            return None;
208        }
209
210        Some(self.get_timestamp(self.timestamps.len() - 1))
211    }
212
213    /// Returns the first sequence in the batch or `None` if the batch is empty.
214    pub fn first_sequence(&self) -> Option<SequenceNumber> {
215        if self.sequences.is_empty() {
216            return None;
217        }
218
219        Some(self.get_sequence(0))
220    }
221
222    /// Returns the last sequence in the batch or `None` if the batch is empty.
223    pub fn last_sequence(&self) -> Option<SequenceNumber> {
224        if self.sequences.is_empty() {
225            return None;
226        }
227
228        Some(self.get_sequence(self.sequences.len() - 1))
229    }
230
    /// Replaces the primary key of the batch.
    ///
    /// Notice that this [Batch] also contains a maybe-exist `pk_values`.
    /// Be sure to update that field as well: this method leaves `pk_values`
    /// untouched, so previously decoded values may no longer match the new key.
    pub fn set_primary_key(&mut self, primary_key: Vec<u8>) {
        self.primary_key = primary_key;
    }
238
239    /// Slice the batch, returning a new batch.
240    ///
241    /// # Panics
242    /// Panics if `offset + length > self.num_rows()`.
243    pub fn slice(&self, offset: usize, length: usize) -> Batch {
244        let fields = self
245            .fields
246            .iter()
247            .map(|column| BatchColumn {
248                column_id: column.column_id,
249                data: column.data.slice(offset, length),
250            })
251            .collect();
252        // We skip using the builder to avoid validating the batch again.
253        Batch {
254            // Now we need to clone the primary key. We could try `Bytes` if
255            // this becomes a bottleneck.
256            primary_key: self.primary_key.clone(),
257            pk_values: self.pk_values.clone(),
258            timestamps: self.timestamps.slice(offset, length),
259            sequences: Arc::new(self.sequences.get_slice(offset, length)),
260            op_types: Arc::new(self.op_types.get_slice(offset, length)),
261            fields,
262            fields_idx: self.fields_idx.clone(),
263        }
264    }
265
266    /// Takes `batches` and concat them into one batch.
267    ///
268    /// All `batches` must have the same primary key.
269    pub fn concat(mut batches: Vec<Batch>) -> Result<Batch> {
270        ensure!(
271            !batches.is_empty(),
272            InvalidBatchSnafu {
273                reason: "empty batches",
274            }
275        );
276        if batches.len() == 1 {
277            // Now we own the `batches` so we could pop it directly.
278            return Ok(batches.pop().unwrap());
279        }
280
281        let primary_key = std::mem::take(&mut batches[0].primary_key);
282        let first = &batches[0];
283        // We took the primary key from the first batch so we don't use `first.primary_key()`.
284        ensure!(
285            batches
286                .iter()
287                .skip(1)
288                .all(|b| b.primary_key() == primary_key),
289            InvalidBatchSnafu {
290                reason: "batches have different primary key",
291            }
292        );
293        for b in batches.iter().skip(1) {
294            ensure!(
295                b.fields.len() == first.fields.len(),
296                InvalidBatchSnafu {
297                    reason: "batches have different field num",
298                }
299            );
300            for (l, r) in b.fields.iter().zip(&first.fields) {
301                ensure!(
302                    l.column_id == r.column_id,
303                    InvalidBatchSnafu {
304                        reason: "batches have different fields",
305                    }
306                );
307            }
308        }
309
310        // We take the primary key from the first batch.
311        let mut builder = BatchBuilder::new(primary_key);
312        // Concat timestamps, sequences, op_types, fields.
313        let array = concat_arrays(batches.iter().map(|b| b.timestamps().to_arrow_array()))?;
314        builder.timestamps_array(array)?;
315        let array = concat_arrays(batches.iter().map(|b| b.sequences().to_arrow_array()))?;
316        builder.sequences_array(array)?;
317        let array = concat_arrays(batches.iter().map(|b| b.op_types().to_arrow_array()))?;
318        builder.op_types_array(array)?;
319        for (i, batch_column) in first.fields.iter().enumerate() {
320            let array = concat_arrays(batches.iter().map(|b| b.fields()[i].data.to_arrow_array()))?;
321            builder.push_field_array(batch_column.column_id, array)?;
322        }
323
324        builder.build()
325    }
326
327    /// Removes rows whose op type is delete.
328    pub fn filter_deleted(&mut self) -> Result<()> {
329        // Safety: op type column is not null.
330        let array = self.op_types.as_arrow();
331        // Find rows with non-delete op type.
332        let rhs = UInt8Array::new_scalar(OpType::Delete as u8);
333        let predicate =
334            arrow::compute::kernels::cmp::neq(array, &rhs).context(ComputeArrowSnafu)?;
335        self.filter(&BooleanVector::from(predicate))
336    }
337
338    // Applies the `predicate` to the batch.
339    // Safety: We know the array type so we unwrap on casting.
340    pub fn filter(&mut self, predicate: &BooleanVector) -> Result<()> {
341        self.timestamps = self
342            .timestamps
343            .filter(predicate)
344            .context(ComputeVectorSnafu)?;
345        self.sequences = Arc::new(
346            UInt64Vector::try_from_arrow_array(
347                arrow::compute::filter(self.sequences.as_arrow(), predicate.as_boolean_array())
348                    .context(ComputeArrowSnafu)?,
349            )
350            .unwrap(),
351        );
352        self.op_types = Arc::new(
353            UInt8Vector::try_from_arrow_array(
354                arrow::compute::filter(self.op_types.as_arrow(), predicate.as_boolean_array())
355                    .context(ComputeArrowSnafu)?,
356            )
357            .unwrap(),
358        );
359        for batch_column in &mut self.fields {
360            batch_column.data = batch_column
361                .data
362                .filter(predicate)
363                .context(ComputeVectorSnafu)?;
364        }
365
366        Ok(())
367    }
368
369    /// Filters rows by the given `sequence`. Only preserves rows with sequence less than or equal to `sequence`.
370    pub fn filter_by_sequence(&mut self, sequence: Option<SequenceRange>) -> Result<()> {
371        let seq_range = match sequence {
372            None => return Ok(()),
373            Some(seq_range) => {
374                let (Some(first), Some(last)) = (self.first_sequence(), self.last_sequence())
375                else {
376                    return Ok(());
377                };
378                let is_subset = match seq_range {
379                    SequenceRange::Gt { min } => min < first,
380                    SequenceRange::LtEq { max } => max >= last,
381                    SequenceRange::GtLtEq { min, max } => min < first && max >= last,
382                };
383                if is_subset {
384                    return Ok(());
385                }
386                seq_range
387            }
388        };
389
390        let seqs = self.sequences.as_arrow();
391        let predicate = seq_range.filter(seqs).context(ComputeArrowSnafu)?;
392
393        let predicate = BooleanVector::from(predicate);
394        self.filter(&predicate)?;
395
396        Ok(())
397    }
398
399    /// Sorts rows in the batch. If `dedup` is true, it also removes
400    /// duplicated rows according to primary keys.
401    ///
402    /// It orders rows by timestamp, sequence desc and only keep the latest
403    /// row for the same timestamp. It doesn't consider op type as sequence
404    /// should already provide uniqueness for a row.
405    pub fn sort(&mut self, dedup: bool) -> Result<()> {
406        // If building a converter each time is costly, we may allow passing a
407        // converter.
408        let converter = RowConverter::new(vec![
409            SortField::new(self.timestamps.data_type().as_arrow_type()),
410            SortField::new_with_options(
411                self.sequences.data_type().as_arrow_type(),
412                SortOptions {
413                    descending: true,
414                    ..Default::default()
415                },
416            ),
417        ])
418        .context(ComputeArrowSnafu)?;
419        // Columns to sort.
420        let columns = [
421            self.timestamps.to_arrow_array(),
422            self.sequences.to_arrow_array(),
423        ];
424        let rows = converter.convert_columns(&columns).unwrap();
425        let mut to_sort: Vec<_> = rows.iter().enumerate().collect();
426
427        let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
428        if !was_sorted {
429            to_sort.sort_unstable_by_key(|x| x.1);
430        }
431
432        let num_rows = to_sort.len();
433        if dedup {
434            // Dedup by timestamps.
435            to_sort.dedup_by(|left, right| {
436                debug_assert_eq!(18, left.1.as_ref().len());
437                debug_assert_eq!(18, right.1.as_ref().len());
438                let (left_key, right_key) = (left.1.as_ref(), right.1.as_ref());
439                // We only compare the timestamp part and ignore sequence.
440                left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
441            });
442        }
443        let no_dedup = to_sort.len() == num_rows;
444
445        if was_sorted && no_dedup {
446            return Ok(());
447        }
448        let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
449        self.take_in_place(&indices)
450    }
451
    /// Merges duplicated timestamps in the batch by keeping the latest non-null field values.
    ///
    /// Rows must already be sorted by timestamp (ascending) and sequence (descending).
    ///
    /// This method deduplicates rows with the same timestamp (keeping the first row in each
    /// timestamp range as the base row) and fills null fields from subsequent rows until all
    /// fields are filled or a delete operation is encountered.
    ///
    /// The timestamp, sequence and op type of each output row always come from the base row.
    /// If the base row itself is a delete, no field is filled from other rows.
    ///
    /// Batches with fewer than two rows, or without duplicated timestamps, are left unchanged.
    ///
    /// # Errors
    /// Returns an error if taking rows from any column fails.
    pub(crate) fn merge_last_non_null(&mut self) -> Result<()> {
        let num_rows = self.num_rows();
        if num_rows < 2 {
            return Ok(());
        }

        let Some(timestamps) = self.timestamps_native() else {
            return Ok(());
        };

        // Fast path: check if there are any duplicate timestamps.
        // The same pass counts the number of distinct timestamp groups, which is
        // used to size the index vectors below.
        let mut has_dup = false;
        let mut group_count = 1;
        for i in 1..num_rows {
            has_dup |= timestamps[i] == timestamps[i - 1];
            group_count += (timestamps[i] != timestamps[i - 1]) as usize;
        }
        if !has_dup {
            return Ok(());
        }

        let num_fields = self.fields.len();
        let op_types = self.op_types.as_arrow().values();

        // `base_indices[g]` is the row providing timestamp/sequence/op type for group `g`.
        // `field_indices[f][g]` is the row providing the value of field `f` for group `g`.
        let mut base_indices: Vec<u32> = Vec::with_capacity(group_count);
        let mut field_indices: Vec<Vec<u32>> = (0..num_fields)
            .map(|_| Vec::with_capacity(group_count))
            .collect();

        let mut start = 0;
        while start < num_rows {
            // Find the end (exclusive) of the run of rows sharing this timestamp.
            let ts = timestamps[start];
            let mut end = start + 1;
            while end < num_rows && timestamps[end] == ts {
                end += 1;
            }

            let group_pos = base_indices.len();
            base_indices.push(start as u32);

            if num_fields > 0 {
                // Default: take the base row for all fields.
                for idx in &mut field_indices {
                    idx.push(start as u32);
                }

                let base_deleted = op_types[start] == OpType::Delete as u8;
                if !base_deleted {
                    // Track fields that are null in the base row and try to fill them from older
                    // rows in the same timestamp range.
                    let mut missing_fields = Vec::new();
                    for (field_idx, col) in self.fields.iter().enumerate() {
                        if col.data.is_null(start) {
                            missing_fields.push(field_idx);
                        }
                    }

                    if !missing_fields.is_empty() {
                        for row_idx in (start + 1)..end {
                            // A delete hides everything older than it in this group.
                            if op_types[row_idx] == OpType::Delete as u8 {
                                break;
                            }

                            // Keep only the fields that are still null in this row;
                            // fields with a value here are resolved to this row.
                            missing_fields.retain(|&field_idx| {
                                if self.fields[field_idx].data.is_null(row_idx) {
                                    true
                                } else {
                                    field_indices[field_idx][group_pos] = row_idx as u32;
                                    false
                                }
                            });

                            if missing_fields.is_empty() {
                                break;
                            }
                        }
                    }
                }
            }

            start = end;
        }

        // Materialize the merged batch: required columns follow the base rows,
        // each field column follows its own per-group row selection.
        let base_indices = UInt32Vector::from_vec(base_indices);
        self.timestamps = self
            .timestamps
            .take(&base_indices)
            .context(ComputeVectorSnafu)?;
        let array = arrow::compute::take(self.sequences.as_arrow(), base_indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: We know the array and vector type.
        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
        let array = arrow::compute::take(self.op_types.as_arrow(), base_indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: We know the array and vector type.
        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());

        for (field_idx, batch_column) in self.fields.iter_mut().enumerate() {
            let idx = UInt32Vector::from_vec(std::mem::take(&mut field_indices[field_idx]));
            batch_column.data = batch_column.data.take(&idx).context(ComputeVectorSnafu)?;
        }

        Ok(())
    }
563
564    /// Returns the estimated memory size of the batch.
565    pub fn memory_size(&self) -> usize {
566        let mut size = std::mem::size_of::<Self>();
567        size += self.primary_key.len();
568        size += self.timestamps.memory_size();
569        size += self.sequences.memory_size();
570        size += self.op_types.memory_size();
571        for batch_column in &self.fields {
572            size += batch_column.data.memory_size();
573        }
574        size
575    }
576
577    /// Returns ids and datatypes of fields in the [Batch] after applying the `projection`.
578    pub(crate) fn projected_fields(
579        metadata: &RegionMetadata,
580        projection: &[ColumnId],
581    ) -> Vec<(ColumnId, ConcreteDataType)> {
582        let projected_ids: HashSet<_> = projection.iter().copied().collect();
583        metadata
584            .field_columns()
585            .filter_map(|column| {
586                if projected_ids.contains(&column.column_id) {
587                    Some((column.column_id, column.column_schema.data_type.clone()))
588                } else {
589                    None
590                }
591            })
592            .collect()
593    }
594
595    /// Returns timestamps in a native slice or `None` if the batch is empty.
596    pub(crate) fn timestamps_native(&self) -> Option<&[i64]> {
597        if self.timestamps.is_empty() {
598            return None;
599        }
600
601        let values = match self.timestamps.data_type() {
602            ConcreteDataType::Timestamp(TimestampType::Second(_)) => self
603                .timestamps
604                .as_any()
605                .downcast_ref::<TimestampSecondVector>()
606                .unwrap()
607                .as_arrow()
608                .values(),
609            ConcreteDataType::Timestamp(TimestampType::Millisecond(_)) => self
610                .timestamps
611                .as_any()
612                .downcast_ref::<TimestampMillisecondVector>()
613                .unwrap()
614                .as_arrow()
615                .values(),
616            ConcreteDataType::Timestamp(TimestampType::Microsecond(_)) => self
617                .timestamps
618                .as_any()
619                .downcast_ref::<TimestampMicrosecondVector>()
620                .unwrap()
621                .as_arrow()
622                .values(),
623            ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)) => self
624                .timestamps
625                .as_any()
626                .downcast_ref::<TimestampNanosecondVector>()
627                .unwrap()
628                .as_arrow()
629                .values(),
630            other => panic!("timestamps in a Batch has other type {:?}", other),
631        };
632
633        Some(values)
634    }
635
636    /// Takes the batch in place.
637    fn take_in_place(&mut self, indices: &UInt32Vector) -> Result<()> {
638        self.timestamps = self.timestamps.take(indices).context(ComputeVectorSnafu)?;
639        let array = arrow::compute::take(self.sequences.as_arrow(), indices.as_arrow(), None)
640            .context(ComputeArrowSnafu)?;
641        // Safety: we know the array and vector type.
642        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
643        let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
644            .context(ComputeArrowSnafu)?;
645        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
646        for batch_column in &mut self.fields {
647            batch_column.data = batch_column
648                .data
649                .take(indices)
650                .context(ComputeVectorSnafu)?;
651        }
652
653        Ok(())
654    }
655
    /// Gets a timestamp at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the timestamp vector returns null.
    fn get_timestamp(&self, index: usize) -> Timestamp {
        match self.timestamps.get_ref(index) {
            ValueRef::Timestamp(timestamp) => timestamp,

            // The [BatchBuilder] has already checked that the data type is timestamp
            // compatible, so any other value is a broken invariant and it's safe to panic.
            value => panic!("{:?} is not a timestamp", value),
        }
    }
668
    /// Gets a sequence at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the sequence vector returns null.
    pub(crate) fn get_sequence(&self, index: usize) -> SequenceNumber {
        // Safety: sequences is not null so it actually returns Some.
        self.sequences.get_data(index).unwrap()
    }
677
678    /// Checks the batch is monotonic by timestamps.
679    #[cfg(debug_assertions)]
680    pub(crate) fn check_monotonic(&self) -> Result<(), String> {
681        use std::cmp::Ordering;
682        if self.timestamps_native().is_none() {
683            return Ok(());
684        }
685
686        let timestamps = self.timestamps_native().unwrap();
687        let sequences = self.sequences.as_arrow().values();
688        for (i, window) in timestamps.windows(2).enumerate() {
689            let current = window[0];
690            let next = window[1];
691            let current_sequence = sequences[i];
692            let next_sequence = sequences[i + 1];
693            match current.cmp(&next) {
694                Ordering::Less => {
695                    // The current timestamp is less than the next timestamp.
696                    continue;
697                }
698                Ordering::Equal => {
699                    // The current timestamp is equal to the next timestamp.
700                    if current_sequence < next_sequence {
701                        return Err(format!(
702                            "sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
703                            current, next, current_sequence, next_sequence, i
704                        ));
705                    }
706                }
707                Ordering::Greater => {
708                    // The current timestamp is greater than the next timestamp.
709                    return Err(format!(
710                        "timestamps are not monotonic: {} > {}, index: {}",
711                        current, next, i
712                    ));
713                }
714            }
715        }
716
717        Ok(())
718    }
719
720    /// Returns Ok if the given batch is behind the current batch.
721    #[cfg(debug_assertions)]
722    pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
723        // Checks the primary key
724        if self.primary_key() < other.primary_key() {
725            return Ok(());
726        }
727        if self.primary_key() > other.primary_key() {
728            return Err(format!(
729                "primary key is not monotonic: {:?} > {:?}",
730                self.primary_key(),
731                other.primary_key()
732            ));
733        }
734        // Checks the timestamp.
735        if self.last_timestamp() < other.first_timestamp() {
736            return Ok(());
737        }
738        if self.last_timestamp() > other.first_timestamp() {
739            return Err(format!(
740                "timestamps are not monotonic: {:?} > {:?}",
741                self.last_timestamp(),
742                other.first_timestamp()
743            ));
744        }
745        // Checks the sequence.
746        if self.last_sequence() >= other.first_sequence() {
747            return Ok(());
748        }
749        Err(format!(
750            "sequences are not monotonic: {:?} < {:?}",
751            self.last_sequence(),
752            other.first_sequence()
753        ))
754    }
755
756    /// Returns the value of the column in the primary key.
757    ///
758    /// Lazily decodes the primary key and caches the result.
759    pub fn pk_col_value(
760        &mut self,
761        codec: &dyn PrimaryKeyCodec,
762        col_idx_in_pk: usize,
763        column_id: ColumnId,
764    ) -> Result<Option<&Value>> {
765        if self.pk_values.is_none() {
766            self.pk_values = Some(codec.decode(&self.primary_key).context(DecodeSnafu)?);
767        }
768
769        let pk_values = self.pk_values.as_ref().unwrap();
770        Ok(match pk_values {
771            CompositeValues::Dense(values) => values.get(col_idx_in_pk).map(|(_, v)| v),
772            CompositeValues::Sparse(values) => values.get(&column_id),
773        })
774    }
775
776    /// Returns values of the field in the batch.
777    ///
778    /// Lazily caches the field index.
779    pub fn field_col_value(&mut self, column_id: ColumnId) -> Option<&BatchColumn> {
780        if self.fields_idx.is_none() {
781            self.fields_idx = Some(
782                self.fields
783                    .iter()
784                    .enumerate()
785                    .map(|(i, c)| (c.column_id, i))
786                    .collect(),
787            );
788        }
789
790        self.fields_idx
791            .as_ref()
792            .unwrap()
793            .get(&column_id)
794            .map(|&idx| &self.fields[idx])
795    }
796}
797
/// A struct to check the batch is monotonic.
///
/// Only available in builds with debug assertions enabled.
#[cfg(debug_assertions)]
#[derive(Default)]
pub(crate) struct BatchChecker {
    /// The batch seen by the previous `check_monotonic` call, if any.
    last_batch: Option<Batch>,
    /// Optional lower bound: a batch whose first timestamp is before it is rejected.
    start: Option<Timestamp>,
    /// Optional upper bound: a batch whose last timestamp is at or after it is rejected.
    end: Option<Timestamp>,
}
806
807#[cfg(debug_assertions)]
808impl BatchChecker {
809    /// Attaches the given start timestamp to the checker.
810    pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
811        self.start = start;
812        self
813    }
814
815    /// Attaches the given end timestamp to the checker.
816    pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
817        self.end = end;
818        self
819    }
820
821    /// Returns true if the given batch is monotonic and behind
822    /// the last batch.
823    pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
824        batch.check_monotonic()?;
825
826        if let (Some(start), Some(first)) = (self.start, batch.first_timestamp())
827            && start > first
828        {
829            return Err(format!(
830                "batch's first timestamp is before the start timestamp: {:?} > {:?}",
831                start, first
832            ));
833        }
834        if let (Some(end), Some(last)) = (self.end, batch.last_timestamp())
835            && end <= last
836        {
837            return Err(format!(
838                "batch's last timestamp is after the end timestamp: {:?} <= {:?}",
839                end, last
840            ));
841        }
842
843        // Checks the batch is behind the last batch.
844        // Then Updates the last batch.
845        let res = self
846            .last_batch
847            .as_ref()
848            .map(|last| last.check_next_batch(batch))
849            .unwrap_or(Ok(()));
850        self.last_batch = Some(batch.clone());
851        res
852    }
853
854    /// Formats current batch and last batch for debug.
855    pub(crate) fn format_batch(&self, batch: &Batch) -> String {
856        use std::fmt::Write;
857
858        let mut message = String::new();
859        if let Some(last) = &self.last_batch {
860            write!(
861                message,
862                "last_pk: {:?}, last_ts: {:?}, last_seq: {:?}, ",
863                last.primary_key(),
864                last.last_timestamp(),
865                last.last_sequence()
866            )
867            .unwrap();
868        }
869        write!(
870            message,
871            "batch_pk: {:?}, batch_ts: {:?}, batch_seq: {:?}",
872            batch.primary_key(),
873            batch.timestamps(),
874            batch.sequences()
875        )
876        .unwrap();
877
878        message
879    }
880
881    /// Checks batches from the part range are monotonic. Otherwise, panics.
882    pub(crate) fn ensure_part_range_batch(
883        &mut self,
884        scanner: &str,
885        region_id: store_api::storage::RegionId,
886        partition: usize,
887        part_range: store_api::region_engine::PartitionRange,
888        batch: &Batch,
889    ) {
890        if let Err(e) = self.check_monotonic(batch) {
891            let err_msg = format!(
892                "{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
893                scanner, e, region_id, partition, part_range,
894            );
895            common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
896            // Only print the number of row in the panic message.
897            panic!("{err_msg}, batch rows: {}", batch.num_rows());
898        }
899    }
900}
901
/// Len of timestamp in arrow row format.
// NOTE(review): presumably one leading marker byte plus an 8-byte timestamp value;
// confirm against the arrow row format documentation.
const TIMESTAMP_KEY_LEN: usize = 9;
904
905/// Helper function to concat arrays from `iter`.
906fn concat_arrays(iter: impl Iterator<Item = ArrayRef>) -> Result<ArrayRef> {
907    let arrays: Vec<_> = iter.collect();
908    let dyn_arrays: Vec<_> = arrays.iter().map(|array| array.as_ref()).collect();
909    arrow::compute::concat(&dyn_arrays).context(ComputeArrowSnafu)
910}
911
/// A column in a [Batch].
///
/// Pairs a column id with the vector holding that column's values.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct BatchColumn {
    /// Id of the column.
    pub column_id: ColumnId,
    /// Data of the column.
    pub data: VectorRef,
}
920
/// Builder to build [Batch].
///
/// `timestamps`, `sequences` and `op_types` are optional while building but
/// must all be set before `build()` is called.
pub struct BatchBuilder {
    /// Primary key bytes handed over to the built [Batch].
    primary_key: Vec<u8>,
    /// Timestamp column, `None` until set.
    timestamps: Option<VectorRef>,
    /// Sequence column, `None` until set.
    sequences: Option<Arc<UInt64Vector>>,
    /// Op type column, `None` until set.
    op_types: Option<Arc<UInt8Vector>>,
    /// Field columns collected so far.
    fields: Vec<BatchColumn>,
}
929
930impl BatchBuilder {
931    /// Creates a new [BatchBuilder] with primary key.
932    pub fn new(primary_key: Vec<u8>) -> BatchBuilder {
933        BatchBuilder {
934            primary_key,
935            timestamps: None,
936            sequences: None,
937            op_types: None,
938            fields: Vec::new(),
939        }
940    }
941
942    /// Creates a new [BatchBuilder] with all required columns.
943    pub fn with_required_columns(
944        primary_key: Vec<u8>,
945        timestamps: VectorRef,
946        sequences: Arc<UInt64Vector>,
947        op_types: Arc<UInt8Vector>,
948    ) -> BatchBuilder {
949        BatchBuilder {
950            primary_key,
951            timestamps: Some(timestamps),
952            sequences: Some(sequences),
953            op_types: Some(op_types),
954            fields: Vec::new(),
955        }
956    }
957
958    /// Set all field columns.
959    pub fn with_fields(mut self, fields: Vec<BatchColumn>) -> Self {
960        self.fields = fields;
961        self
962    }
963
964    /// Push a field column.
965    pub fn push_field(&mut self, column: BatchColumn) -> &mut Self {
966        self.fields.push(column);
967        self
968    }
969
970    /// Push an array as a field.
971    pub fn push_field_array(&mut self, column_id: ColumnId, array: ArrayRef) -> Result<&mut Self> {
972        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
973        self.fields.push(BatchColumn {
974            column_id,
975            data: vector,
976        });
977
978        Ok(self)
979    }
980
981    /// Try to set an array as timestamps.
982    pub fn timestamps_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
983        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
984        ensure!(
985            vector.data_type().is_timestamp(),
986            InvalidBatchSnafu {
987                reason: format!("{:?} is not a timestamp type", vector.data_type()),
988            }
989        );
990
991        self.timestamps = Some(vector);
992        Ok(self)
993    }
994
995    /// Try to set an array as sequences.
996    pub fn sequences_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
997        ensure!(
998            *array.data_type() == arrow::datatypes::DataType::UInt64,
999            InvalidBatchSnafu {
1000                reason: "sequence array is not UInt64 type",
1001            }
1002        );
1003        // Safety: The cast must success as we have ensured it is uint64 type.
1004        let vector = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
1005        self.sequences = Some(vector);
1006
1007        Ok(self)
1008    }
1009
1010    /// Try to set an array as op types.
1011    pub fn op_types_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
1012        ensure!(
1013            *array.data_type() == arrow::datatypes::DataType::UInt8,
1014            InvalidBatchSnafu {
1015                reason: "sequence array is not UInt8 type",
1016            }
1017        );
1018        // Safety: The cast must success as we have ensured it is uint64 type.
1019        let vector = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
1020        self.op_types = Some(vector);
1021
1022        Ok(self)
1023    }
1024
1025    /// Builds the [Batch].
1026    pub fn build(self) -> Result<Batch> {
1027        let timestamps = self.timestamps.context(InvalidBatchSnafu {
1028            reason: "missing timestamps",
1029        })?;
1030        let sequences = self.sequences.context(InvalidBatchSnafu {
1031            reason: "missing sequences",
1032        })?;
1033        let op_types = self.op_types.context(InvalidBatchSnafu {
1034            reason: "missing op_types",
1035        })?;
1036        // Our storage format ensure these columns are not nullable so
1037        // we use assert here.
1038        assert_eq!(0, timestamps.null_count());
1039        assert_eq!(0, sequences.null_count());
1040        assert_eq!(0, op_types.null_count());
1041
1042        let ts_len = timestamps.len();
1043        ensure!(
1044            sequences.len() == ts_len,
1045            InvalidBatchSnafu {
1046                reason: format!(
1047                    "sequence have different len {} != {}",
1048                    sequences.len(),
1049                    ts_len
1050                ),
1051            }
1052        );
1053        ensure!(
1054            op_types.len() == ts_len,
1055            InvalidBatchSnafu {
1056                reason: format!(
1057                    "op type have different len {} != {}",
1058                    op_types.len(),
1059                    ts_len
1060                ),
1061            }
1062        );
1063        for column in &self.fields {
1064            ensure!(
1065                column.data.len() == ts_len,
1066                InvalidBatchSnafu {
1067                    reason: format!(
1068                        "column {} has different len {} != {}",
1069                        column.column_id,
1070                        column.data.len(),
1071                        ts_len
1072                    ),
1073                }
1074            );
1075        }
1076
1077        Ok(Batch {
1078            primary_key: self.primary_key,
1079            pk_values: None,
1080            timestamps,
1081            sequences,
1082            op_types,
1083            fields: self.fields,
1084            fields_idx: None,
1085        })
1086    }
1087}
1088
1089impl From<Batch> for BatchBuilder {
1090    fn from(batch: Batch) -> Self {
1091        Self {
1092            primary_key: batch.primary_key,
1093            timestamps: Some(batch.timestamps),
1094            sequences: Some(batch.sequences),
1095            op_types: Some(batch.op_types),
1096            fields: batch.fields,
1097        }
1098    }
1099}
1100
/// Async [Batch] reader and iterator wrapper.
///
/// This is the data source for SST writers or internal readers.
/// Each variant wraps one kind of batch producer so callers can pull batches
/// through a single `next_batch()` method.
pub enum Source {
    /// Source from a [BoxedBatchReader].
    Reader(BoxedBatchReader),
    /// Source from a [BoxedBatchIterator].
    Iter(BoxedBatchIterator),
    /// Source from a [BoxedBatchStream].
    Stream(BoxedBatchStream),
    /// Source from a [PruneReader].
    PruneReader(PruneReader),
}
1114
1115impl Source {
1116    /// Returns next [Batch] from this data source.
1117    pub async fn next_batch(&mut self) -> Result<Option<Batch>> {
1118        match self {
1119            Source::Reader(reader) => reader.next_batch().await,
1120            Source::Iter(iter) => iter.next().transpose(),
1121            Source::Stream(stream) => stream.try_next().await,
1122            Source::PruneReader(reader) => reader.next_batch().await,
1123        }
1124    }
1125}
1126
/// Async [RecordBatch] reader and iterator wrapper for flat format.
///
/// Each variant wraps one kind of record batch producer so callers can pull
/// record batches through a single `next_batch()` method.
pub enum FlatSource {
    /// Source from a [BoxedRecordBatchIterator].
    Iter(BoxedRecordBatchIterator),
    /// Source from a [BoxedRecordBatchStream].
    Stream(BoxedRecordBatchStream),
}
1134
1135impl FlatSource {
1136    /// Returns next [RecordBatch] from this data source.
1137    pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
1138        match self {
1139            FlatSource::Iter(iter) => iter.next().transpose(),
1140            FlatSource::Stream(stream) => stream.try_next().await,
1141        }
1142    }
1143}
1144
/// Async batch reader.
///
/// The reader must guarantee [Batch]es returned by it have the same schema.
#[async_trait]
pub trait BatchReader: Send {
    /// Fetch next [Batch].
    ///
    /// Returns `Ok(None)` when the reader has reached its end; calling `next_batch()`
    /// again after that will not return any more batches.
    ///
    /// If `Err` is returned, the caller should not call this method again; the
    /// implementor may or may not panic in such a case.
    async fn next_batch(&mut self) -> Result<Option<Batch>>;
}
1159
/// Pointer to [BatchReader] (a boxed trait object).
pub type BoxedBatchReader = Box<dyn BatchReader>;

/// Pointer to a stream that yields [Batch].
///
/// Each stream item is a `Result`, so errors are delivered through the stream.
pub type BoxedBatchStream = BoxStream<'static, Result<Batch>>;

/// Pointer to a stream that yields [RecordBatch].
///
/// Each stream item is a `Result`, so errors are delivered through the stream.
pub type BoxedRecordBatchStream = BoxStream<'static, Result<RecordBatch>>;
1168
1169#[async_trait::async_trait]
1170impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
1171    async fn next_batch(&mut self) -> Result<Option<Batch>> {
1172        (**self).next_batch().await
1173    }
1174}
1175
/// Local metrics for scanners.
///
/// The derived `Default` starts every duration and counter at zero.
#[derive(Debug, Default)]
pub(crate) struct ScannerMetrics {
    /// Duration to scan data.
    scan_cost: Duration,
    /// Duration while waiting for `yield`.
    yield_cost: Duration,
    /// Number of batches returned.
    num_batches: usize,
    /// Number of rows returned.
    num_rows: usize,
}
1188
1189#[cfg(test)]
1190mod tests {
1191    use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
1192    use mito_codec::row_converter::{self, build_primary_key_codec_with_fields};
1193    use store_api::codec::PrimaryKeyEncoding;
1194    use store_api::storage::consts::ReservedColumnId;
1195
1196    use super::*;
1197    use crate::error::Error;
1198    use crate::test_util::new_batch_builder;
1199
1200    fn new_batch(
1201        timestamps: &[i64],
1202        sequences: &[u64],
1203        op_types: &[OpType],
1204        field: &[u64],
1205    ) -> Batch {
1206        new_batch_builder(b"test", timestamps, sequences, op_types, 1, field)
1207            .build()
1208            .unwrap()
1209    }
1210
1211    fn new_batch_with_u64_fields(
1212        timestamps: &[i64],
1213        sequences: &[u64],
1214        op_types: &[OpType],
1215        fields: &[(ColumnId, &[Option<u64>])],
1216    ) -> Batch {
1217        assert_eq!(timestamps.len(), sequences.len());
1218        assert_eq!(timestamps.len(), op_types.len());
1219        for (_, values) in fields {
1220            assert_eq!(timestamps.len(), values.len());
1221        }
1222
1223        let mut builder = BatchBuilder::new(b"test".to_vec());
1224        builder
1225            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1226                timestamps.iter().copied(),
1227            )))
1228            .unwrap()
1229            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1230                sequences.iter().copied(),
1231            )))
1232            .unwrap()
1233            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1234                op_types.iter().map(|v| *v as u8),
1235            )))
1236            .unwrap();
1237
1238        for (col_id, values) in fields {
1239            builder
1240                .push_field_array(*col_id, Arc::new(UInt64Array::from(values.to_vec())))
1241                .unwrap();
1242        }
1243
1244        builder.build().unwrap()
1245    }
1246
1247    fn new_batch_without_fields(
1248        timestamps: &[i64],
1249        sequences: &[u64],
1250        op_types: &[OpType],
1251    ) -> Batch {
1252        assert_eq!(timestamps.len(), sequences.len());
1253        assert_eq!(timestamps.len(), op_types.len());
1254
1255        let mut builder = BatchBuilder::new(b"test".to_vec());
1256        builder
1257            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1258                timestamps.iter().copied(),
1259            )))
1260            .unwrap()
1261            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1262                sequences.iter().copied(),
1263            )))
1264            .unwrap()
1265            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1266                op_types.iter().map(|v| *v as u8),
1267            )))
1268            .unwrap();
1269
1270        builder.build().unwrap()
1271    }
1272
1273    #[test]
1274    fn test_empty_batch() {
1275        let batch = Batch::empty();
1276        assert!(batch.is_empty());
1277        assert_eq!(None, batch.first_timestamp());
1278        assert_eq!(None, batch.last_timestamp());
1279        assert_eq!(None, batch.first_sequence());
1280        assert_eq!(None, batch.last_sequence());
1281        assert!(batch.timestamps_native().is_none());
1282    }
1283
1284    #[test]
1285    fn test_first_last_one() {
1286        let batch = new_batch(&[1], &[2], &[OpType::Put], &[4]);
1287        assert_eq!(
1288            Timestamp::new_millisecond(1),
1289            batch.first_timestamp().unwrap()
1290        );
1291        assert_eq!(
1292            Timestamp::new_millisecond(1),
1293            batch.last_timestamp().unwrap()
1294        );
1295        assert_eq!(2, batch.first_sequence().unwrap());
1296        assert_eq!(2, batch.last_sequence().unwrap());
1297    }
1298
1299    #[test]
1300    fn test_first_last_multiple() {
1301        let batch = new_batch(
1302            &[1, 2, 3],
1303            &[11, 12, 13],
1304            &[OpType::Put, OpType::Put, OpType::Put],
1305            &[21, 22, 23],
1306        );
1307        assert_eq!(
1308            Timestamp::new_millisecond(1),
1309            batch.first_timestamp().unwrap()
1310        );
1311        assert_eq!(
1312            Timestamp::new_millisecond(3),
1313            batch.last_timestamp().unwrap()
1314        );
1315        assert_eq!(11, batch.first_sequence().unwrap());
1316        assert_eq!(13, batch.last_sequence().unwrap());
1317    }
1318
1319    #[test]
1320    fn test_slice() {
1321        let batch = new_batch(
1322            &[1, 2, 3, 4],
1323            &[11, 12, 13, 14],
1324            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1325            &[21, 22, 23, 24],
1326        );
1327        let batch = batch.slice(1, 2);
1328        let expect = new_batch(
1329            &[2, 3],
1330            &[12, 13],
1331            &[OpType::Delete, OpType::Put],
1332            &[22, 23],
1333        );
1334        assert_eq!(expect, batch);
1335    }
1336
1337    #[test]
1338    fn test_timestamps_native() {
1339        let batch = new_batch(
1340            &[1, 2, 3, 4],
1341            &[11, 12, 13, 14],
1342            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1343            &[21, 22, 23, 24],
1344        );
1345        assert_eq!(&[1, 2, 3, 4], batch.timestamps_native().unwrap());
1346    }
1347
1348    #[test]
1349    fn test_concat_empty() {
1350        let err = Batch::concat(vec![]).unwrap_err();
1351        assert!(
1352            matches!(err, Error::InvalidBatch { .. }),
1353            "unexpected err: {err}"
1354        );
1355    }
1356
1357    #[test]
1358    fn test_concat_one() {
1359        let batch = new_batch(&[], &[], &[], &[]);
1360        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1361        assert_eq!(batch, actual);
1362
1363        let batch = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
1364        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1365        assert_eq!(batch, actual);
1366    }
1367
1368    #[test]
1369    fn test_concat_multiple() {
1370        let batches = vec![
1371            new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]),
1372            new_batch(
1373                &[3, 4, 5],
1374                &[13, 14, 15],
1375                &[OpType::Put, OpType::Delete, OpType::Put],
1376                &[23, 24, 25],
1377            ),
1378            new_batch(&[], &[], &[], &[]),
1379            new_batch(&[6], &[16], &[OpType::Put], &[26]),
1380        ];
1381        let batch = Batch::concat(batches).unwrap();
1382        let expect = new_batch(
1383            &[1, 2, 3, 4, 5, 6],
1384            &[11, 12, 13, 14, 15, 16],
1385            &[
1386                OpType::Put,
1387                OpType::Put,
1388                OpType::Put,
1389                OpType::Delete,
1390                OpType::Put,
1391                OpType::Put,
1392            ],
1393            &[21, 22, 23, 24, 25, 26],
1394        );
1395        assert_eq!(expect, batch);
1396    }
1397
1398    #[test]
1399    fn test_concat_different() {
1400        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1401        let mut batch2 = new_batch(&[2], &[2], &[OpType::Put], &[2]);
1402        batch2.primary_key = b"hello".to_vec();
1403        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1404        assert!(
1405            matches!(err, Error::InvalidBatch { .. }),
1406            "unexpected err: {err}"
1407        );
1408    }
1409
1410    #[test]
1411    fn test_concat_different_fields() {
1412        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1413        let fields = vec![
1414            batch1.fields()[0].clone(),
1415            BatchColumn {
1416                column_id: 2,
1417                data: Arc::new(UInt64Vector::from_slice([2])),
1418            },
1419        ];
1420        // Batch 2 has more fields.
1421        let batch2 = batch1.clone().with_fields(fields).unwrap();
1422        let err = Batch::concat(vec![batch1.clone(), batch2]).unwrap_err();
1423        assert!(
1424            matches!(err, Error::InvalidBatch { .. }),
1425            "unexpected err: {err}"
1426        );
1427
1428        // Batch 2 has different field.
1429        let fields = vec![BatchColumn {
1430            column_id: 2,
1431            data: Arc::new(UInt64Vector::from_slice([2])),
1432        }];
1433        let batch2 = batch1.clone().with_fields(fields).unwrap();
1434        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1435        assert!(
1436            matches!(err, Error::InvalidBatch { .. }),
1437            "unexpected err: {err}"
1438        );
1439    }
1440
1441    #[test]
1442    fn test_filter_deleted_empty() {
1443        let mut batch = new_batch(&[], &[], &[], &[]);
1444        batch.filter_deleted().unwrap();
1445        assert!(batch.is_empty());
1446    }
1447
1448    #[test]
1449    fn test_filter_deleted() {
1450        let mut batch = new_batch(
1451            &[1, 2, 3, 4],
1452            &[11, 12, 13, 14],
1453            &[OpType::Delete, OpType::Put, OpType::Delete, OpType::Put],
1454            &[21, 22, 23, 24],
1455        );
1456        batch.filter_deleted().unwrap();
1457        let expect = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
1458        assert_eq!(expect, batch);
1459
1460        let mut batch = new_batch(
1461            &[1, 2, 3, 4],
1462            &[11, 12, 13, 14],
1463            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1464            &[21, 22, 23, 24],
1465        );
1466        let expect = batch.clone();
1467        batch.filter_deleted().unwrap();
1468        assert_eq!(expect, batch);
1469    }
1470
    #[test]
    fn test_filter_by_sequence() {
        // `LtEq` variant: keeps rows whose sequence is <= 13 (all rows are puts).
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[1, 2, 3],
            &[11, 12, 13],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23],
        );
        assert_eq!(expect, batch);

        // Filters to empty: every sequence is above the upper bound.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );

        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // No sequence range: the batch is left unchanged.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let expect = batch.clone();
        batch.filter_by_sequence(None).unwrap();
        assert_eq!(expect, batch);

        // Filter an empty batch.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // Filter an empty batch with `None`.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch.filter_by_sequence(None).unwrap();
        assert!(batch.is_empty());

        // `Gt` variant - exclusive lower bound.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 12 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // `Gt` variant with no matches.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 20 }))
            .unwrap();
        assert!(batch.is_empty());

        // `GtLtEq` variant - exclusive lower bound, inclusive upper bound.
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 12, max: 14 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // `GtLtEq` variant with mixed operations: deletes inside the range are kept.
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Delete,
                OpType::Put,
                OpType::Delete,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 11, max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[2, 3],
            &[12, 13],
            &[OpType::Delete, OpType::Put],
            &[22, 23],
        );
        assert_eq!(expect, batch);

        // `GtLtEq` variant with no matches.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 20, max: 25 }))
            .unwrap();
        assert!(batch.is_empty());
    }
1607
1608    #[test]
1609    fn test_merge_last_non_null_no_dup() {
1610        let mut batch = new_batch_with_u64_fields(
1611            &[1, 2],
1612            &[2, 1],
1613            &[OpType::Put, OpType::Put],
1614            &[(1, &[Some(10), None]), (2, &[Some(100), Some(200)])],
1615        );
1616        let expect = batch.clone();
1617        batch.merge_last_non_null().unwrap();
1618        assert_eq!(expect, batch);
1619    }
1620
1621    #[test]
1622    fn test_merge_last_non_null_fill_null_fields() {
1623        // Rows are already sorted by timestamp asc and sequence desc.
1624        let mut batch = new_batch_with_u64_fields(
1625            &[1, 1, 1],
1626            &[3, 2, 1],
1627            &[OpType::Put, OpType::Put, OpType::Put],
1628            &[
1629                (1, &[None, Some(10), Some(11)]),
1630                (2, &[Some(100), Some(200), Some(300)]),
1631            ],
1632        );
1633        batch.merge_last_non_null().unwrap();
1634
1635        // Field 1 is filled from the first older row (seq=2). Field 2 keeps the base value.
1636        // Filled fields must not be overwritten by even older duplicates.
1637        let expect = new_batch_with_u64_fields(
1638            &[1],
1639            &[3],
1640            &[OpType::Put],
1641            &[(1, &[Some(10)]), (2, &[Some(100)])],
1642        );
1643        assert_eq!(expect, batch);
1644    }
1645
1646    #[test]
1647    fn test_merge_last_non_null_stop_at_delete_row() {
1648        // A delete row in older duplicates should stop filling to avoid resurrecting values before
1649        // deletion.
1650        let mut batch = new_batch_with_u64_fields(
1651            &[1, 1, 1],
1652            &[3, 2, 1],
1653            &[OpType::Put, OpType::Delete, OpType::Put],
1654            &[
1655                (1, &[None, Some(10), Some(11)]),
1656                (2, &[Some(100), Some(200), Some(300)]),
1657            ],
1658        );
1659        batch.merge_last_non_null().unwrap();
1660
1661        let expect = new_batch_with_u64_fields(
1662            &[1],
1663            &[3],
1664            &[OpType::Put],
1665            &[(1, &[None]), (2, &[Some(100)])],
1666        );
1667        assert_eq!(expect, batch);
1668    }
1669
1670    #[test]
1671    fn test_merge_last_non_null_base_delete_no_merge() {
1672        let mut batch = new_batch_with_u64_fields(
1673            &[1, 1],
1674            &[3, 2],
1675            &[OpType::Delete, OpType::Put],
1676            &[(1, &[None, Some(10)]), (2, &[None, Some(200)])],
1677        );
1678        batch.merge_last_non_null().unwrap();
1679
1680        // Base row is delete, keep it as is and don't merge fields from older rows.
1681        let expect =
1682            new_batch_with_u64_fields(&[1], &[3], &[OpType::Delete], &[(1, &[None]), (2, &[None])]);
1683        assert_eq!(expect, batch);
1684    }
1685
1686    #[test]
1687    fn test_merge_last_non_null_multiple_timestamp_groups() {
1688        let mut batch = new_batch_with_u64_fields(
1689            &[1, 1, 2, 3, 3],
1690            &[5, 4, 3, 2, 1],
1691            &[
1692                OpType::Put,
1693                OpType::Put,
1694                OpType::Put,
1695                OpType::Put,
1696                OpType::Put,
1697            ],
1698            &[
1699                (1, &[None, Some(10), Some(20), None, Some(30)]),
1700                (2, &[Some(100), Some(110), Some(120), None, Some(130)]),
1701            ],
1702        );
1703        batch.merge_last_non_null().unwrap();
1704
1705        let expect = new_batch_with_u64_fields(
1706            &[1, 2, 3],
1707            &[5, 3, 2],
1708            &[OpType::Put, OpType::Put, OpType::Put],
1709            &[
1710                (1, &[Some(10), Some(20), Some(30)]),
1711                (2, &[Some(100), Some(120), Some(130)]),
1712            ],
1713        );
1714        assert_eq!(expect, batch);
1715    }
1716
1717    #[test]
1718    fn test_merge_last_non_null_no_fields() {
1719        let mut batch = new_batch_without_fields(
1720            &[1, 1, 2],
1721            &[3, 2, 1],
1722            &[OpType::Put, OpType::Put, OpType::Put],
1723        );
1724        batch.merge_last_non_null().unwrap();
1725
1726        let expect = new_batch_without_fields(&[1, 2], &[3, 1], &[OpType::Put, OpType::Put]);
1727        assert_eq!(expect, batch);
1728    }
1729
1730    #[test]
1731    fn test_filter() {
1732        // Filters put only.
1733        let mut batch = new_batch(
1734            &[1, 2, 3, 4],
1735            &[11, 12, 13, 14],
1736            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1737            &[21, 22, 23, 24],
1738        );
1739        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1740        batch.filter(&predicate).unwrap();
1741        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1742        assert_eq!(expect, batch);
1743
1744        // Filters deletion.
1745        let mut batch = new_batch(
1746            &[1, 2, 3, 4],
1747            &[11, 12, 13, 14],
1748            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1749            &[21, 22, 23, 24],
1750        );
1751        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1752        batch.filter(&predicate).unwrap();
1753        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1754        assert_eq!(expect, batch);
1755
1756        // Filters to empty.
1757        let predicate = BooleanVector::from_vec(vec![false, false]);
1758        batch.filter(&predicate).unwrap();
1759        assert!(batch.is_empty());
1760    }
1761
1762    #[test]
1763    fn test_sort_and_dedup() {
1764        let original = new_batch(
1765            &[2, 3, 1, 4, 5, 2],
1766            &[1, 2, 3, 4, 5, 6],
1767            &[
1768                OpType::Put,
1769                OpType::Put,
1770                OpType::Put,
1771                OpType::Put,
1772                OpType::Put,
1773                OpType::Put,
1774            ],
1775            &[21, 22, 23, 24, 25, 26],
1776        );
1777
1778        let mut batch = original.clone();
1779        batch.sort(true).unwrap();
1780        // It should only keep one timestamp 2.
1781        assert_eq!(
1782            new_batch(
1783                &[1, 2, 3, 4, 5],
1784                &[3, 6, 2, 4, 5],
1785                &[
1786                    OpType::Put,
1787                    OpType::Put,
1788                    OpType::Put,
1789                    OpType::Put,
1790                    OpType::Put,
1791                ],
1792                &[23, 26, 22, 24, 25],
1793            ),
1794            batch
1795        );
1796
1797        let mut batch = original.clone();
1798        batch.sort(false).unwrap();
1799
1800        // It should only keep one timestamp 2.
1801        assert_eq!(
1802            new_batch(
1803                &[1, 2, 2, 3, 4, 5],
1804                &[3, 6, 1, 2, 4, 5],
1805                &[
1806                    OpType::Put,
1807                    OpType::Put,
1808                    OpType::Put,
1809                    OpType::Put,
1810                    OpType::Put,
1811                    OpType::Put,
1812                ],
1813                &[23, 26, 21, 22, 24, 25],
1814            ),
1815            batch
1816        );
1817
1818        let original = new_batch(
1819            &[2, 2, 1],
1820            &[1, 6, 1],
1821            &[OpType::Delete, OpType::Put, OpType::Put],
1822            &[21, 22, 23],
1823        );
1824
1825        let mut batch = original.clone();
1826        batch.sort(true).unwrap();
1827        let expect = new_batch(&[1, 2], &[1, 6], &[OpType::Put, OpType::Put], &[23, 22]);
1828        assert_eq!(expect, batch);
1829
1830        let mut batch = original.clone();
1831        batch.sort(false).unwrap();
1832        let expect = new_batch(
1833            &[1, 2, 2],
1834            &[1, 6, 1],
1835            &[OpType::Put, OpType::Put, OpType::Delete],
1836            &[23, 22, 21],
1837        );
1838        assert_eq!(expect, batch);
1839    }
1840
1841    #[test]
1842    fn test_get_value() {
1843        let encodings = [PrimaryKeyEncoding::Dense, PrimaryKeyEncoding::Sparse];
1844
1845        for encoding in encodings {
1846            let codec = build_primary_key_codec_with_fields(
1847                encoding,
1848                [
1849                    (
1850                        ReservedColumnId::table_id(),
1851                        row_converter::SortField::new(ConcreteDataType::uint32_datatype()),
1852                    ),
1853                    (
1854                        ReservedColumnId::tsid(),
1855                        row_converter::SortField::new(ConcreteDataType::uint64_datatype()),
1856                    ),
1857                    (
1858                        100,
1859                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1860                    ),
1861                    (
1862                        200,
1863                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1864                    ),
1865                ]
1866                .into_iter(),
1867            );
1868
1869            let values = [
1870                Value::UInt32(1000),
1871                Value::UInt64(2000),
1872                Value::String("abcdefgh".into()),
1873                Value::String("zyxwvu".into()),
1874            ];
1875            let mut buf = vec![];
1876            codec
1877                .encode_values(
1878                    &[
1879                        (ReservedColumnId::table_id(), values[0].clone()),
1880                        (ReservedColumnId::tsid(), values[1].clone()),
1881                        (100, values[2].clone()),
1882                        (200, values[3].clone()),
1883                    ],
1884                    &mut buf,
1885                )
1886                .unwrap();
1887
1888            let field_col_id = 2;
1889            let mut batch = new_batch_builder(
1890                &buf,
1891                &[1, 2, 3],
1892                &[1, 1, 1],
1893                &[OpType::Put, OpType::Put, OpType::Put],
1894                field_col_id,
1895                &[42, 43, 44],
1896            )
1897            .build()
1898            .unwrap();
1899
1900            let v = batch
1901                .pk_col_value(&*codec, 0, ReservedColumnId::table_id())
1902                .unwrap()
1903                .unwrap();
1904            assert_eq!(values[0], *v);
1905
1906            let v = batch
1907                .pk_col_value(&*codec, 1, ReservedColumnId::tsid())
1908                .unwrap()
1909                .unwrap();
1910            assert_eq!(values[1], *v);
1911
1912            let v = batch.pk_col_value(&*codec, 2, 100).unwrap().unwrap();
1913            assert_eq!(values[2], *v);
1914
1915            let v = batch.pk_col_value(&*codec, 3, 200).unwrap().unwrap();
1916            assert_eq!(values[3], *v);
1917
1918            let v = batch.field_col_value(field_col_id).unwrap();
1919            assert_eq!(v.data.get(0), Value::UInt64(42));
1920            assert_eq!(v.data.get(1), Value::UInt64(43));
1921            assert_eq!(v.data.get(2), Value::UInt64(44));
1922        }
1923    }
1924}