Skip to main content

mito2/
read.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Common structs and utilities for reading data.
16
17pub mod batch_adapter;
18pub mod compat;
19pub mod dedup;
20pub mod flat_dedup;
21pub mod flat_merge;
22pub mod flat_projection;
23pub mod last_row;
24pub mod merge;
25pub mod plain_batch;
26pub mod projection;
27pub(crate) mod prune;
28pub(crate) mod pruner;
29pub mod range;
30#[cfg(feature = "test")]
31pub mod range_cache;
32#[cfg(not(feature = "test"))]
33pub(crate) mod range_cache;
34pub mod scan_region;
35pub mod scan_util;
36pub(crate) mod seq_scan;
37pub mod series_scan;
38pub mod stream;
39pub(crate) mod unordered_scan;
40
41use std::collections::{HashMap, HashSet};
42use std::sync::Arc;
43use std::time::Duration;
44
45use api::v1::OpType;
46use async_trait::async_trait;
47use common_time::Timestamp;
48use datafusion_common::arrow::array::UInt8Array;
49use datatypes::arrow;
50use datatypes::arrow::array::{Array, ArrayRef};
51use datatypes::arrow::compute::SortOptions;
52use datatypes::arrow::record_batch::RecordBatch;
53use datatypes::arrow::row::{RowConverter, SortField};
54use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
55use datatypes::scalars::ScalarVectorBuilder;
56use datatypes::types::TimestampType;
57use datatypes::value::{Value, ValueRef};
58use datatypes::vectors::{
59    BooleanVector, Helper, TimestampMicrosecondVector, TimestampMillisecondVector,
60    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampSecondVector,
61    UInt8Vector, UInt8VectorBuilder, UInt32Vector, UInt64Vector, UInt64VectorBuilder, Vector,
62    VectorRef,
63};
64use futures::TryStreamExt;
65use futures::stream::BoxStream;
66use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
67use snafu::{OptionExt, ResultExt, ensure};
68use store_api::metadata::RegionMetadata;
69use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
70
71use crate::error::{
72    ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu,
73    Result,
74};
75use crate::memtable::{BoxedBatchIterator, BoxedRecordBatchIterator};
76use crate::read::prune::PruneReader;
77
/// Storage internal representation of a batch of rows for a primary key (time series).
///
/// Rows are sorted by primary key, timestamp, sequence desc, op_type desc. Fields
/// always keep the same relative order as fields in [RegionMetadata](store_api::metadata::RegionMetadata).
///
/// All columns (timestamps, sequences, op types and every field) are expected
/// to have the same number of rows.
#[derive(Debug, PartialEq, Clone)]
pub struct Batch {
    /// Primary key encoded in a comparable form.
    primary_key: Vec<u8>,
    /// Possibly decoded `primary_key` values. Some places would decode it in advance.
    ///
    /// It is filled by [`Batch::set_pk_values`] or lazily by [`Batch::pk_col_value`].
    pk_values: Option<CompositeValues>,
    /// Timestamps of rows, should be sorted and not null.
    timestamps: VectorRef,
    /// Sequences of rows
    ///
    /// UInt64 type, not null.
    sequences: Arc<UInt64Vector>,
    /// Op types of rows
    ///
    /// UInt8 type, not null.
    op_types: Arc<UInt8Vector>,
    /// Fields organized in columnar format.
    fields: Vec<BatchColumn>,
    /// Cache for field index lookup.
    ///
    /// Maps a column id to its position in `fields`. It is built lazily by
    /// [`Batch::field_col_value`].
    fields_idx: Option<HashMap<ColumnId, usize>>,
}
103
104impl Batch {
105    /// Creates a new batch.
106    pub fn new(
107        primary_key: Vec<u8>,
108        timestamps: VectorRef,
109        sequences: Arc<UInt64Vector>,
110        op_types: Arc<UInt8Vector>,
111        fields: Vec<BatchColumn>,
112    ) -> Result<Batch> {
113        BatchBuilder::with_required_columns(primary_key, timestamps, sequences, op_types)
114            .with_fields(fields)
115            .build()
116    }
117
118    /// Tries to set fields for the batch.
119    pub fn with_fields(self, fields: Vec<BatchColumn>) -> Result<Batch> {
120        Batch::new(
121            self.primary_key,
122            self.timestamps,
123            self.sequences,
124            self.op_types,
125            fields,
126        )
127    }
128
129    /// Returns primary key of the batch.
130    pub fn primary_key(&self) -> &[u8] {
131        &self.primary_key
132    }
133
    /// Returns possibly decoded primary-key values.
    ///
    /// Returns `None` if the primary key has not been decoded (or set) yet.
    pub fn pk_values(&self) -> Option<&CompositeValues> {
        self.pk_values.as_ref()
    }
138
    /// Sets possibly decoded primary-key values.
    ///
    /// Any previously stored values are replaced. The encoded `primary_key`
    /// is not touched, so the caller is responsible for keeping both in sync.
    pub fn set_pk_values(&mut self, pk_values: CompositeValues) {
        self.pk_values = Some(pk_values);
    }
143
    /// Removes possibly decoded primary-key values. For testing only.
    ///
    /// After this call [`Batch::pk_values`] returns `None`.
    #[cfg(any(test, feature = "test"))]
    pub fn remove_pk_values(&mut self) {
        self.pk_values = None;
    }
149
150    /// Returns fields in the batch.
151    pub fn fields(&self) -> &[BatchColumn] {
152        &self.fields
153    }
154
    /// Returns timestamps of the batch.
    ///
    /// The returned vector has one entry per row.
    pub fn timestamps(&self) -> &VectorRef {
        &self.timestamps
    }
159
    /// Returns sequences of the batch.
    ///
    /// The returned vector has one entry per row.
    pub fn sequences(&self) -> &Arc<UInt64Vector> {
        &self.sequences
    }
164
    /// Returns op types of the batch.
    ///
    /// The returned vector has one entry per row.
    pub fn op_types(&self) -> &Arc<UInt8Vector> {
        &self.op_types
    }
169
    /// Returns the number of rows in the batch.
    ///
    /// The count is taken from the sequence column.
    pub fn num_rows(&self) -> usize {
        // All vectors have the same length. We use the length of sequences vector
        // since it has static type.
        self.sequences.len()
    }
176
177    /// Create an empty [`Batch`].
178    #[allow(dead_code)]
179    pub(crate) fn empty() -> Self {
180        Self {
181            primary_key: vec![],
182            pk_values: None,
183            timestamps: Arc::new(TimestampMillisecondVectorBuilder::with_capacity(0).finish()),
184            sequences: Arc::new(UInt64VectorBuilder::with_capacity(0).finish()),
185            op_types: Arc::new(UInt8VectorBuilder::with_capacity(0).finish()),
186            fields: vec![],
187            fields_idx: None,
188        }
189    }
190
    /// Returns true if the number of rows in the batch is 0.
    ///
    /// This is a shorthand for `self.num_rows() == 0`.
    pub fn is_empty(&self) -> bool {
        self.num_rows() == 0
    }
195
196    /// Returns the first timestamp in the batch or `None` if the batch is empty.
197    pub fn first_timestamp(&self) -> Option<Timestamp> {
198        if self.timestamps.is_empty() {
199            return None;
200        }
201
202        Some(self.get_timestamp(0))
203    }
204
205    /// Returns the last timestamp in the batch or `None` if the batch is empty.
206    pub fn last_timestamp(&self) -> Option<Timestamp> {
207        if self.timestamps.is_empty() {
208            return None;
209        }
210
211        Some(self.get_timestamp(self.timestamps.len() - 1))
212    }
213
214    /// Returns the first sequence in the batch or `None` if the batch is empty.
215    pub fn first_sequence(&self) -> Option<SequenceNumber> {
216        if self.sequences.is_empty() {
217            return None;
218        }
219
220        Some(self.get_sequence(0))
221    }
222
223    /// Returns the last sequence in the batch or `None` if the batch is empty.
224    pub fn last_sequence(&self) -> Option<SequenceNumber> {
225        if self.sequences.is_empty() {
226            return None;
227        }
228
229        Some(self.get_sequence(self.sequences.len() - 1))
230    }
231
    /// Replaces the primary key of the batch.
    ///
    /// Notice that this [Batch] may also hold decoded `pk_values`, which this
    /// method does not modify. Be sure to update that field as well.
    pub fn set_primary_key(&mut self, primary_key: Vec<u8>) {
        self.primary_key = primary_key;
    }
239
240    /// Slice the batch, returning a new batch.
241    ///
242    /// # Panics
243    /// Panics if `offset + length > self.num_rows()`.
244    pub fn slice(&self, offset: usize, length: usize) -> Batch {
245        let fields = self
246            .fields
247            .iter()
248            .map(|column| BatchColumn {
249                column_id: column.column_id,
250                data: column.data.slice(offset, length),
251            })
252            .collect();
253        // We skip using the builder to avoid validating the batch again.
254        Batch {
255            // Now we need to clone the primary key. We could try `Bytes` if
256            // this becomes a bottleneck.
257            primary_key: self.primary_key.clone(),
258            pk_values: self.pk_values.clone(),
259            timestamps: self.timestamps.slice(offset, length),
260            sequences: Arc::new(self.sequences.get_slice(offset, length)),
261            op_types: Arc::new(self.op_types.get_slice(offset, length)),
262            fields,
263            fields_idx: self.fields_idx.clone(),
264        }
265    }
266
267    /// Takes `batches` and concat them into one batch.
268    ///
269    /// All `batches` must have the same primary key.
270    pub fn concat(mut batches: Vec<Batch>) -> Result<Batch> {
271        ensure!(
272            !batches.is_empty(),
273            InvalidBatchSnafu {
274                reason: "empty batches",
275            }
276        );
277        if batches.len() == 1 {
278            // Now we own the `batches` so we could pop it directly.
279            return Ok(batches.pop().unwrap());
280        }
281
282        let primary_key = std::mem::take(&mut batches[0].primary_key);
283        let first = &batches[0];
284        // We took the primary key from the first batch so we don't use `first.primary_key()`.
285        ensure!(
286            batches
287                .iter()
288                .skip(1)
289                .all(|b| b.primary_key() == primary_key),
290            InvalidBatchSnafu {
291                reason: "batches have different primary key",
292            }
293        );
294        for b in batches.iter().skip(1) {
295            ensure!(
296                b.fields.len() == first.fields.len(),
297                InvalidBatchSnafu {
298                    reason: "batches have different field num",
299                }
300            );
301            for (l, r) in b.fields.iter().zip(&first.fields) {
302                ensure!(
303                    l.column_id == r.column_id,
304                    InvalidBatchSnafu {
305                        reason: "batches have different fields",
306                    }
307                );
308            }
309        }
310
311        // We take the primary key from the first batch.
312        let mut builder = BatchBuilder::new(primary_key);
313        // Concat timestamps, sequences, op_types, fields.
314        let array = concat_arrays(batches.iter().map(|b| b.timestamps().to_arrow_array()))?;
315        builder.timestamps_array(array)?;
316        let array = concat_arrays(batches.iter().map(|b| b.sequences().to_arrow_array()))?;
317        builder.sequences_array(array)?;
318        let array = concat_arrays(batches.iter().map(|b| b.op_types().to_arrow_array()))?;
319        builder.op_types_array(array)?;
320        for (i, batch_column) in first.fields.iter().enumerate() {
321            let array = concat_arrays(batches.iter().map(|b| b.fields()[i].data.to_arrow_array()))?;
322            builder.push_field_array(batch_column.column_id, array)?;
323        }
324
325        builder.build()
326    }
327
328    /// Removes rows whose op type is delete.
329    pub fn filter_deleted(&mut self) -> Result<()> {
330        // Safety: op type column is not null.
331        let array = self.op_types.as_arrow();
332        // Find rows with non-delete op type.
333        let rhs = UInt8Array::new_scalar(OpType::Delete as u8);
334        let predicate =
335            arrow::compute::kernels::cmp::neq(array, &rhs).context(ComputeArrowSnafu)?;
336        self.filter(&BooleanVector::from(predicate))
337    }
338
339    // Applies the `predicate` to the batch.
340    // Safety: We know the array type so we unwrap on casting.
341    pub fn filter(&mut self, predicate: &BooleanVector) -> Result<()> {
342        self.timestamps = self
343            .timestamps
344            .filter(predicate)
345            .context(ComputeVectorSnafu)?;
346        self.sequences = Arc::new(
347            UInt64Vector::try_from_arrow_array(
348                arrow::compute::filter(self.sequences.as_arrow(), predicate.as_boolean_array())
349                    .context(ComputeArrowSnafu)?,
350            )
351            .unwrap(),
352        );
353        self.op_types = Arc::new(
354            UInt8Vector::try_from_arrow_array(
355                arrow::compute::filter(self.op_types.as_arrow(), predicate.as_boolean_array())
356                    .context(ComputeArrowSnafu)?,
357            )
358            .unwrap(),
359        );
360        for batch_column in &mut self.fields {
361            batch_column.data = batch_column
362                .data
363                .filter(predicate)
364                .context(ComputeVectorSnafu)?;
365        }
366
367        Ok(())
368    }
369
370    /// Filters rows by the given `sequence`. Only preserves rows with sequence less than or equal to `sequence`.
371    pub fn filter_by_sequence(&mut self, sequence: Option<SequenceRange>) -> Result<()> {
372        let seq_range = match sequence {
373            None => return Ok(()),
374            Some(seq_range) => {
375                let (Some(first), Some(last)) = (self.first_sequence(), self.last_sequence())
376                else {
377                    return Ok(());
378                };
379                let is_subset = match seq_range {
380                    SequenceRange::Gt { min } => min < first,
381                    SequenceRange::LtEq { max } => max >= last,
382                    SequenceRange::GtLtEq { min, max } => min < first && max >= last,
383                };
384                if is_subset {
385                    return Ok(());
386                }
387                seq_range
388            }
389        };
390
391        let seqs = self.sequences.as_arrow();
392        let predicate = seq_range.filter(seqs).context(ComputeArrowSnafu)?;
393
394        let predicate = BooleanVector::from(predicate);
395        self.filter(&predicate)?;
396
397        Ok(())
398    }
399
400    /// Sorts rows in the batch. If `dedup` is true, it also removes
401    /// duplicated rows according to primary keys.
402    ///
403    /// It orders rows by timestamp, sequence desc and only keep the latest
404    /// row for the same timestamp. It doesn't consider op type as sequence
405    /// should already provide uniqueness for a row.
406    pub fn sort(&mut self, dedup: bool) -> Result<()> {
407        // If building a converter each time is costly, we may allow passing a
408        // converter.
409        let converter = RowConverter::new(vec![
410            SortField::new(self.timestamps.data_type().as_arrow_type()),
411            SortField::new_with_options(
412                self.sequences.data_type().as_arrow_type(),
413                SortOptions {
414                    descending: true,
415                    ..Default::default()
416                },
417            ),
418        ])
419        .context(ComputeArrowSnafu)?;
420        // Columns to sort.
421        let columns = [
422            self.timestamps.to_arrow_array(),
423            self.sequences.to_arrow_array(),
424        ];
425        let rows = converter.convert_columns(&columns).unwrap();
426        let mut to_sort: Vec<_> = rows.iter().enumerate().collect();
427
428        let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
429        if !was_sorted {
430            to_sort.sort_unstable_by_key(|x| x.1);
431        }
432
433        let num_rows = to_sort.len();
434        if dedup {
435            // Dedup by timestamps.
436            to_sort.dedup_by(|left, right| {
437                debug_assert_eq!(18, left.1.as_ref().len());
438                debug_assert_eq!(18, right.1.as_ref().len());
439                let (left_key, right_key) = (left.1.as_ref(), right.1.as_ref());
440                // We only compare the timestamp part and ignore sequence.
441                left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
442            });
443        }
444        let no_dedup = to_sort.len() == num_rows;
445
446        if was_sorted && no_dedup {
447            return Ok(());
448        }
449        let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
450        self.take_in_place(&indices)
451    }
452
453    /// Merges duplicated timestamps in the batch by keeping the latest non-null field values.
454    ///
455    /// Rows must already be sorted by timestamp (ascending) and sequence (descending).
456    ///
457    /// This method deduplicates rows with the same timestamp (keeping the first row in each
458    /// timestamp range as the base row) and fills null fields from subsequent rows until all
459    /// fields are filled or a delete operation is encountered.
460    pub(crate) fn merge_last_non_null(&mut self) -> Result<()> {
461        let num_rows = self.num_rows();
462        if num_rows < 2 {
463            return Ok(());
464        }
465
466        let Some(timestamps) = self.timestamps_native() else {
467            return Ok(());
468        };
469
470        // Fast path: check if there are any duplicate timestamps.
471        let mut has_dup = false;
472        let mut group_count = 1;
473        for i in 1..num_rows {
474            has_dup |= timestamps[i] == timestamps[i - 1];
475            group_count += (timestamps[i] != timestamps[i - 1]) as usize;
476        }
477        if !has_dup {
478            return Ok(());
479        }
480
481        let num_fields = self.fields.len();
482        let op_types = self.op_types.as_arrow().values();
483
484        let mut base_indices: Vec<u32> = Vec::with_capacity(group_count);
485        let mut field_indices: Vec<Vec<u32>> = (0..num_fields)
486            .map(|_| Vec::with_capacity(group_count))
487            .collect();
488
489        let mut start = 0;
490        while start < num_rows {
491            let ts = timestamps[start];
492            let mut end = start + 1;
493            while end < num_rows && timestamps[end] == ts {
494                end += 1;
495            }
496
497            let group_pos = base_indices.len();
498            base_indices.push(start as u32);
499
500            if num_fields > 0 {
501                // Default: take the base row for all fields.
502                for idx in &mut field_indices {
503                    idx.push(start as u32);
504                }
505
506                let base_deleted = op_types[start] == OpType::Delete as u8;
507                if !base_deleted {
508                    // Track fields that are null in the base row and try to fill them from older
509                    // rows in the same timestamp range.
510                    let mut missing_fields = Vec::new();
511                    for (field_idx, col) in self.fields.iter().enumerate() {
512                        if col.data.is_null(start) {
513                            missing_fields.push(field_idx);
514                        }
515                    }
516
517                    if !missing_fields.is_empty() {
518                        for row_idx in (start + 1)..end {
519                            if op_types[row_idx] == OpType::Delete as u8 {
520                                break;
521                            }
522
523                            missing_fields.retain(|&field_idx| {
524                                if self.fields[field_idx].data.is_null(row_idx) {
525                                    true
526                                } else {
527                                    field_indices[field_idx][group_pos] = row_idx as u32;
528                                    false
529                                }
530                            });
531
532                            if missing_fields.is_empty() {
533                                break;
534                            }
535                        }
536                    }
537                }
538            }
539
540            start = end;
541        }
542
543        let base_indices = UInt32Vector::from_vec(base_indices);
544        self.timestamps = self
545            .timestamps
546            .take(&base_indices)
547            .context(ComputeVectorSnafu)?;
548        let array = arrow::compute::take(self.sequences.as_arrow(), base_indices.as_arrow(), None)
549            .context(ComputeArrowSnafu)?;
550        // Safety: We know the array and vector type.
551        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
552        let array = arrow::compute::take(self.op_types.as_arrow(), base_indices.as_arrow(), None)
553            .context(ComputeArrowSnafu)?;
554        // Safety: We know the array and vector type.
555        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
556
557        for (field_idx, batch_column) in self.fields.iter_mut().enumerate() {
558            let idx = UInt32Vector::from_vec(std::mem::take(&mut field_indices[field_idx]));
559            batch_column.data = batch_column.data.take(&idx).context(ComputeVectorSnafu)?;
560        }
561
562        Ok(())
563    }
564
565    /// Returns the estimated memory size of the batch.
566    pub fn memory_size(&self) -> usize {
567        let mut size = std::mem::size_of::<Self>();
568        size += self.primary_key.len();
569        size += self.timestamps.memory_size();
570        size += self.sequences.memory_size();
571        size += self.op_types.memory_size();
572        for batch_column in &self.fields {
573            size += batch_column.data.memory_size();
574        }
575        size
576    }
577
578    /// Returns ids and datatypes of fields in the [Batch] after applying the `projection`.
579    pub(crate) fn projected_fields(
580        metadata: &RegionMetadata,
581        projection: &[ColumnId],
582    ) -> Vec<(ColumnId, ConcreteDataType)> {
583        let projected_ids: HashSet<_> = projection.iter().copied().collect();
584        metadata
585            .field_columns()
586            .filter_map(|column| {
587                if projected_ids.contains(&column.column_id) {
588                    Some((column.column_id, column.column_schema.data_type.clone()))
589                } else {
590                    None
591                }
592            })
593            .collect()
594    }
595
596    /// Returns timestamps in a native slice or `None` if the batch is empty.
597    pub(crate) fn timestamps_native(&self) -> Option<&[i64]> {
598        if self.timestamps.is_empty() {
599            return None;
600        }
601
602        let values = match self.timestamps.data_type() {
603            ConcreteDataType::Timestamp(TimestampType::Second(_)) => self
604                .timestamps
605                .as_any()
606                .downcast_ref::<TimestampSecondVector>()
607                .unwrap()
608                .as_arrow()
609                .values(),
610            ConcreteDataType::Timestamp(TimestampType::Millisecond(_)) => self
611                .timestamps
612                .as_any()
613                .downcast_ref::<TimestampMillisecondVector>()
614                .unwrap()
615                .as_arrow()
616                .values(),
617            ConcreteDataType::Timestamp(TimestampType::Microsecond(_)) => self
618                .timestamps
619                .as_any()
620                .downcast_ref::<TimestampMicrosecondVector>()
621                .unwrap()
622                .as_arrow()
623                .values(),
624            ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)) => self
625                .timestamps
626                .as_any()
627                .downcast_ref::<TimestampNanosecondVector>()
628                .unwrap()
629                .as_arrow()
630                .values(),
631            other => panic!("timestamps in a Batch has other type {:?}", other),
632        };
633
634        Some(values)
635    }
636
637    /// Takes the batch in place.
638    fn take_in_place(&mut self, indices: &UInt32Vector) -> Result<()> {
639        self.timestamps = self.timestamps.take(indices).context(ComputeVectorSnafu)?;
640        let array = arrow::compute::take(self.sequences.as_arrow(), indices.as_arrow(), None)
641            .context(ComputeArrowSnafu)?;
642        // Safety: we know the array and vector type.
643        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
644        let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
645            .context(ComputeArrowSnafu)?;
646        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
647        for batch_column in &mut self.fields {
648            batch_column.data = batch_column
649                .data
650                .take(indices)
651                .context(ComputeVectorSnafu)?;
652        }
653
654        Ok(())
655    }
656
    /// Gets a timestamp at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the timestamp vector returns null.
    fn get_timestamp(&self, index: usize) -> Timestamp {
        match self.timestamps.get_ref(index) {
            ValueRef::Timestamp(timestamp) => timestamp,

            // We have checked that the data type is timestamp compatible in the [BatchBuilder] so it's safe to panic.
            value => panic!("{:?} is not a timestamp", value),
        }
    }
669
    /// Gets a sequence at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the sequence vector returns null.
    pub(crate) fn get_sequence(&self, index: usize) -> SequenceNumber {
        // Safety: the sequence column is not null, so `get_data` returns `Some` here.
        self.sequences.get_data(index).unwrap()
    }
678
679    /// Checks the batch is monotonic by timestamps.
680    #[cfg(debug_assertions)]
681    #[allow(dead_code)]
682    pub(crate) fn check_monotonic(&self) -> Result<(), String> {
683        use std::cmp::Ordering;
684        if self.timestamps_native().is_none() {
685            return Ok(());
686        }
687
688        let timestamps = self.timestamps_native().unwrap();
689        let sequences = self.sequences.as_arrow().values();
690        for (i, window) in timestamps.windows(2).enumerate() {
691            let current = window[0];
692            let next = window[1];
693            let current_sequence = sequences[i];
694            let next_sequence = sequences[i + 1];
695            match current.cmp(&next) {
696                Ordering::Less => {
697                    // The current timestamp is less than the next timestamp.
698                    continue;
699                }
700                Ordering::Equal => {
701                    // The current timestamp is equal to the next timestamp.
702                    if current_sequence < next_sequence {
703                        return Err(format!(
704                            "sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
705                            current, next, current_sequence, next_sequence, i
706                        ));
707                    }
708                }
709                Ordering::Greater => {
710                    // The current timestamp is greater than the next timestamp.
711                    return Err(format!(
712                        "timestamps are not monotonic: {} > {}, index: {}",
713                        current, next, i
714                    ));
715                }
716            }
717        }
718
719        Ok(())
720    }
721
722    /// Returns Ok if the given batch is behind the current batch.
723    #[cfg(debug_assertions)]
724    #[allow(dead_code)]
725    pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
726        // Checks the primary key
727        if self.primary_key() < other.primary_key() {
728            return Ok(());
729        }
730        if self.primary_key() > other.primary_key() {
731            return Err(format!(
732                "primary key is not monotonic: {:?} > {:?}",
733                self.primary_key(),
734                other.primary_key()
735            ));
736        }
737        // Checks the timestamp.
738        if self.last_timestamp() < other.first_timestamp() {
739            return Ok(());
740        }
741        if self.last_timestamp() > other.first_timestamp() {
742            return Err(format!(
743                "timestamps are not monotonic: {:?} > {:?}",
744                self.last_timestamp(),
745                other.first_timestamp()
746            ));
747        }
748        // Checks the sequence.
749        if self.last_sequence() >= other.first_sequence() {
750            return Ok(());
751        }
752        Err(format!(
753            "sequences are not monotonic: {:?} < {:?}",
754            self.last_sequence(),
755            other.first_sequence()
756        ))
757    }
758
759    /// Returns the value of the column in the primary key.
760    ///
761    /// Lazily decodes the primary key and caches the result.
762    pub fn pk_col_value(
763        &mut self,
764        codec: &dyn PrimaryKeyCodec,
765        col_idx_in_pk: usize,
766        column_id: ColumnId,
767    ) -> Result<Option<&Value>> {
768        if self.pk_values.is_none() {
769            self.pk_values = Some(codec.decode(&self.primary_key).context(DecodeSnafu)?);
770        }
771
772        let pk_values = self.pk_values.as_ref().unwrap();
773        Ok(match pk_values {
774            CompositeValues::Dense(values) => values.get(col_idx_in_pk).map(|(_, v)| v),
775            CompositeValues::Sparse(values) => values.get(&column_id),
776        })
777    }
778
779    /// Returns values of the field in the batch.
780    ///
781    /// Lazily caches the field index.
782    pub fn field_col_value(&mut self, column_id: ColumnId) -> Option<&BatchColumn> {
783        if self.fields_idx.is_none() {
784            self.fields_idx = Some(
785                self.fields
786                    .iter()
787                    .enumerate()
788                    .map(|(i, c)| (c.column_id, i))
789                    .collect(),
790            );
791        }
792
793        self.fields_idx
794            .as_ref()
795            .unwrap()
796            .get(&column_id)
797            .map(|&idx| &self.fields[idx])
798    }
799}
800
/// A struct to check the batch is monotonic.
///
/// It remembers the last checked batch so consecutive batches can be compared
/// with each other. Only available in debug builds.
#[cfg(debug_assertions)]
#[derive(Default)]
#[allow(dead_code)]
pub(crate) struct BatchChecker {
    /// The last batch passed to `check_monotonic`, if any.
    last_batch: Option<Batch>,
    /// Optional lower bound: a batch must not start before this timestamp.
    start: Option<Timestamp>,
    /// Optional upper bound: the last timestamp of a batch must be before this timestamp.
    end: Option<Timestamp>,
}
810
811#[cfg(debug_assertions)]
812#[allow(dead_code)]
813impl BatchChecker {
814    /// Attaches the given start timestamp to the checker.
815    pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
816        self.start = start;
817        self
818    }
819
820    /// Attaches the given end timestamp to the checker.
821    pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
822        self.end = end;
823        self
824    }
825
826    /// Returns true if the given batch is monotonic and behind
827    /// the last batch.
828    pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
829        batch.check_monotonic()?;
830
831        if let (Some(start), Some(first)) = (self.start, batch.first_timestamp())
832            && start > first
833        {
834            return Err(format!(
835                "batch's first timestamp is before the start timestamp: {:?} > {:?}",
836                start, first
837            ));
838        }
839        if let (Some(end), Some(last)) = (self.end, batch.last_timestamp())
840            && end <= last
841        {
842            return Err(format!(
843                "batch's last timestamp is after the end timestamp: {:?} <= {:?}",
844                end, last
845            ));
846        }
847
848        // Checks the batch is behind the last batch.
849        // Then Updates the last batch.
850        let res = self
851            .last_batch
852            .as_ref()
853            .map(|last| last.check_next_batch(batch))
854            .unwrap_or(Ok(()));
855        self.last_batch = Some(batch.clone());
856        res
857    }
858
859    /// Formats current batch and last batch for debug.
860    pub(crate) fn format_batch(&self, batch: &Batch) -> String {
861        use std::fmt::Write;
862
863        let mut message = String::new();
864        if let Some(last) = &self.last_batch {
865            write!(
866                message,
867                "last_pk: {:?}, last_ts: {:?}, last_seq: {:?}, ",
868                last.primary_key(),
869                last.last_timestamp(),
870                last.last_sequence()
871            )
872            .unwrap();
873        }
874        write!(
875            message,
876            "batch_pk: {:?}, batch_ts: {:?}, batch_seq: {:?}",
877            batch.primary_key(),
878            batch.timestamps(),
879            batch.sequences()
880        )
881        .unwrap();
882
883        message
884    }
885
886    /// Checks batches from the part range are monotonic. Otherwise, panics.
887    pub(crate) fn ensure_part_range_batch(
888        &mut self,
889        scanner: &str,
890        region_id: store_api::storage::RegionId,
891        partition: usize,
892        part_range: store_api::region_engine::PartitionRange,
893        batch: &Batch,
894    ) {
895        if let Err(e) = self.check_monotonic(batch) {
896            let err_msg = format!(
897                "{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
898                scanner, e, region_id, partition, part_range,
899            );
900            common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
901            // Only print the number of row in the panic message.
902            panic!("{err_msg}, batch rows: {}", batch.num_rows());
903        }
904    }
905}
906
/// Length in bytes of a timestamp encoded in the arrow row format.
///
/// NOTE(review): presumably one leading null-indicator byte plus the 8-byte
/// timestamp value; confirm against arrow's row encoding.
const TIMESTAMP_KEY_LEN: usize = 9;
909
910/// Helper function to concat arrays from `iter`.
911fn concat_arrays(iter: impl Iterator<Item = ArrayRef>) -> Result<ArrayRef> {
912    let arrays: Vec<_> = iter.collect();
913    let dyn_arrays: Vec<_> = arrays.iter().map(|array| array.as_ref()).collect();
914    arrow::compute::concat(&dyn_arrays).context(ComputeArrowSnafu)
915}
916
/// A column in a [Batch].
///
/// Pairs the column's id with the vector holding its values.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct BatchColumn {
    /// Id of the column.
    pub column_id: ColumnId,
    /// Data of the column.
    pub data: VectorRef,
}
925
/// Builder to build [Batch].
///
/// Timestamps, sequences and op types are optional while building but must
/// all be set before `build()` is called, otherwise it returns an error.
pub struct BatchBuilder {
    /// Encoded primary key of the batch.
    primary_key: Vec<u8>,
    /// Timestamp column; required by `build()`.
    timestamps: Option<VectorRef>,
    /// Sequence column; required by `build()`.
    sequences: Option<Arc<UInt64Vector>>,
    /// Op type column; required by `build()`.
    op_types: Option<Arc<UInt8Vector>>,
    /// Field columns, possibly empty.
    fields: Vec<BatchColumn>,
}
934
935impl BatchBuilder {
936    /// Creates a new [BatchBuilder] with primary key.
937    pub fn new(primary_key: Vec<u8>) -> BatchBuilder {
938        BatchBuilder {
939            primary_key,
940            timestamps: None,
941            sequences: None,
942            op_types: None,
943            fields: Vec::new(),
944        }
945    }
946
947    /// Creates a new [BatchBuilder] with all required columns.
948    pub fn with_required_columns(
949        primary_key: Vec<u8>,
950        timestamps: VectorRef,
951        sequences: Arc<UInt64Vector>,
952        op_types: Arc<UInt8Vector>,
953    ) -> BatchBuilder {
954        BatchBuilder {
955            primary_key,
956            timestamps: Some(timestamps),
957            sequences: Some(sequences),
958            op_types: Some(op_types),
959            fields: Vec::new(),
960        }
961    }
962
963    /// Set all field columns.
964    pub fn with_fields(mut self, fields: Vec<BatchColumn>) -> Self {
965        self.fields = fields;
966        self
967    }
968
969    /// Push a field column.
970    pub fn push_field(&mut self, column: BatchColumn) -> &mut Self {
971        self.fields.push(column);
972        self
973    }
974
975    /// Push an array as a field.
976    pub fn push_field_array(&mut self, column_id: ColumnId, array: ArrayRef) -> Result<&mut Self> {
977        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
978        self.fields.push(BatchColumn {
979            column_id,
980            data: vector,
981        });
982
983        Ok(self)
984    }
985
986    /// Try to set an array as timestamps.
987    pub fn timestamps_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
988        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
989        ensure!(
990            vector.data_type().is_timestamp(),
991            InvalidBatchSnafu {
992                reason: format!("{:?} is not a timestamp type", vector.data_type()),
993            }
994        );
995
996        self.timestamps = Some(vector);
997        Ok(self)
998    }
999
1000    /// Try to set an array as sequences.
1001    pub fn sequences_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
1002        ensure!(
1003            *array.data_type() == arrow::datatypes::DataType::UInt64,
1004            InvalidBatchSnafu {
1005                reason: "sequence array is not UInt64 type",
1006            }
1007        );
1008        // Safety: The cast must success as we have ensured it is uint64 type.
1009        let vector = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
1010        self.sequences = Some(vector);
1011
1012        Ok(self)
1013    }
1014
1015    /// Try to set an array as op types.
1016    pub fn op_types_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
1017        ensure!(
1018            *array.data_type() == arrow::datatypes::DataType::UInt8,
1019            InvalidBatchSnafu {
1020                reason: "sequence array is not UInt8 type",
1021            }
1022        );
1023        // Safety: The cast must success as we have ensured it is uint64 type.
1024        let vector = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
1025        self.op_types = Some(vector);
1026
1027        Ok(self)
1028    }
1029
1030    /// Builds the [Batch].
1031    pub fn build(self) -> Result<Batch> {
1032        let timestamps = self.timestamps.context(InvalidBatchSnafu {
1033            reason: "missing timestamps",
1034        })?;
1035        let sequences = self.sequences.context(InvalidBatchSnafu {
1036            reason: "missing sequences",
1037        })?;
1038        let op_types = self.op_types.context(InvalidBatchSnafu {
1039            reason: "missing op_types",
1040        })?;
1041        // Our storage format ensure these columns are not nullable so
1042        // we use assert here.
1043        assert_eq!(0, timestamps.null_count());
1044        assert_eq!(0, sequences.null_count());
1045        assert_eq!(0, op_types.null_count());
1046
1047        let ts_len = timestamps.len();
1048        ensure!(
1049            sequences.len() == ts_len,
1050            InvalidBatchSnafu {
1051                reason: format!(
1052                    "sequence have different len {} != {}",
1053                    sequences.len(),
1054                    ts_len
1055                ),
1056            }
1057        );
1058        ensure!(
1059            op_types.len() == ts_len,
1060            InvalidBatchSnafu {
1061                reason: format!(
1062                    "op type have different len {} != {}",
1063                    op_types.len(),
1064                    ts_len
1065                ),
1066            }
1067        );
1068        for column in &self.fields {
1069            ensure!(
1070                column.data.len() == ts_len,
1071                InvalidBatchSnafu {
1072                    reason: format!(
1073                        "column {} has different len {} != {}",
1074                        column.column_id,
1075                        column.data.len(),
1076                        ts_len
1077                    ),
1078                }
1079            );
1080        }
1081
1082        Ok(Batch {
1083            primary_key: self.primary_key,
1084            pk_values: None,
1085            timestamps,
1086            sequences,
1087            op_types,
1088            fields: self.fields,
1089            fields_idx: None,
1090        })
1091    }
1092}
1093
1094impl From<Batch> for BatchBuilder {
1095    fn from(batch: Batch) -> Self {
1096        Self {
1097            primary_key: batch.primary_key,
1098            timestamps: Some(batch.timestamps),
1099            sequences: Some(batch.sequences),
1100            op_types: Some(batch.op_types),
1101            fields: batch.fields,
1102        }
1103    }
1104}
1105
/// Async [Batch] reader and iterator wrapper.
///
/// This is the data source for SST writers or internal readers.
/// Each variant wraps a different kind of [Batch] producer.
pub enum Source {
    /// Source from a [BoxedBatchReader].
    Reader(BoxedBatchReader),
    /// Source from a [BoxedBatchIterator].
    Iter(BoxedBatchIterator),
    /// Source from a [BoxedBatchStream].
    Stream(BoxedBatchStream),
    /// Source from a [PruneReader].
    PruneReader(PruneReader),
}
1119
1120impl Source {
1121    /// Returns next [Batch] from this data source.
1122    pub async fn next_batch(&mut self) -> Result<Option<Batch>> {
1123        match self {
1124            Source::Reader(reader) => reader.next_batch().await,
1125            Source::Iter(iter) => iter.next().transpose(),
1126            Source::Stream(stream) => stream.try_next().await,
1127            Source::PruneReader(reader) => reader.next_batch().await,
1128        }
1129    }
1130}
1131
/// Async [RecordBatch] reader and iterator wrapper for flat format.
///
/// Each variant wraps a different kind of [RecordBatch] producer.
pub enum FlatSource {
    /// Source from a [BoxedRecordBatchIterator].
    Iter(BoxedRecordBatchIterator),
    /// Source from a [BoxedRecordBatchStream].
    Stream(BoxedRecordBatchStream),
}
1139
1140impl FlatSource {
1141    /// Returns next [RecordBatch] from this data source.
1142    pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
1143        match self {
1144            FlatSource::Iter(iter) => iter.next().transpose(),
1145            FlatSource::Stream(stream) => stream.try_next().await,
1146        }
1147    }
1148}
1149
/// Async batch reader.
///
/// The reader must guarantee [Batch]es returned by it have the same schema.
#[async_trait]
pub trait BatchReader: Send {
    /// Fetch next [Batch].
    ///
    /// Returns `Ok(None)` when the reader has reached its end; calling `next_batch()`
    /// again after that will not yield any more batches.
    ///
    /// If `Err` is returned, the caller should not call this method again; the
    /// implementor may or may not panic in such a case.
    async fn next_batch(&mut self) -> Result<Option<Batch>>;
}
1164
/// Pointer to [BatchReader] (a boxed trait object).
pub type BoxedBatchReader = Box<dyn BatchReader>;

/// Pointer to a stream that yields [Batch] results.
pub type BoxedBatchStream = BoxStream<'static, Result<Batch>>;

/// Pointer to a stream that yields [RecordBatch] results.
pub type BoxedRecordBatchStream = BoxStream<'static, Result<RecordBatch>>;
1173
1174#[async_trait::async_trait]
1175impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
1176    async fn next_batch(&mut self) -> Result<Option<Batch>> {
1177        (**self).next_batch().await
1178    }
1179}
1180
/// Local metrics for scanners.
///
/// All metrics start from zero through the derived [Default].
#[derive(Debug, Default)]
pub(crate) struct ScannerMetrics {
    /// Duration to scan data.
    scan_cost: Duration,
    /// Duration while waiting for `yield`.
    yield_cost: Duration,
    /// Number of batches returned.
    num_batches: usize,
    /// Number of rows returned.
    num_rows: usize,
}
1193
1194#[cfg(test)]
1195mod tests {
1196    use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
1197    use mito_codec::row_converter::{self, build_primary_key_codec_with_fields};
1198    use store_api::codec::PrimaryKeyEncoding;
1199    use store_api::storage::consts::ReservedColumnId;
1200
1201    use super::*;
1202    use crate::error::Error;
1203    use crate::test_util::new_batch_builder;
1204
1205    fn new_batch(
1206        timestamps: &[i64],
1207        sequences: &[u64],
1208        op_types: &[OpType],
1209        field: &[u64],
1210    ) -> Batch {
1211        new_batch_builder(b"test", timestamps, sequences, op_types, 1, field)
1212            .build()
1213            .unwrap()
1214    }
1215
1216    fn new_batch_with_u64_fields(
1217        timestamps: &[i64],
1218        sequences: &[u64],
1219        op_types: &[OpType],
1220        fields: &[(ColumnId, &[Option<u64>])],
1221    ) -> Batch {
1222        assert_eq!(timestamps.len(), sequences.len());
1223        assert_eq!(timestamps.len(), op_types.len());
1224        for (_, values) in fields {
1225            assert_eq!(timestamps.len(), values.len());
1226        }
1227
1228        let mut builder = BatchBuilder::new(b"test".to_vec());
1229        builder
1230            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1231                timestamps.iter().copied(),
1232            )))
1233            .unwrap()
1234            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1235                sequences.iter().copied(),
1236            )))
1237            .unwrap()
1238            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1239                op_types.iter().map(|v| *v as u8),
1240            )))
1241            .unwrap();
1242
1243        for (col_id, values) in fields {
1244            builder
1245                .push_field_array(*col_id, Arc::new(UInt64Array::from(values.to_vec())))
1246                .unwrap();
1247        }
1248
1249        builder.build().unwrap()
1250    }
1251
1252    fn new_batch_without_fields(
1253        timestamps: &[i64],
1254        sequences: &[u64],
1255        op_types: &[OpType],
1256    ) -> Batch {
1257        assert_eq!(timestamps.len(), sequences.len());
1258        assert_eq!(timestamps.len(), op_types.len());
1259
1260        let mut builder = BatchBuilder::new(b"test".to_vec());
1261        builder
1262            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
1263                timestamps.iter().copied(),
1264            )))
1265            .unwrap()
1266            .sequences_array(Arc::new(UInt64Array::from_iter_values(
1267                sequences.iter().copied(),
1268            )))
1269            .unwrap()
1270            .op_types_array(Arc::new(UInt8Array::from_iter_values(
1271                op_types.iter().map(|v| *v as u8),
1272            )))
1273            .unwrap();
1274
1275        builder.build().unwrap()
1276    }
1277
1278    #[test]
1279    fn test_empty_batch() {
1280        let batch = Batch::empty();
1281        assert!(batch.is_empty());
1282        assert_eq!(None, batch.first_timestamp());
1283        assert_eq!(None, batch.last_timestamp());
1284        assert_eq!(None, batch.first_sequence());
1285        assert_eq!(None, batch.last_sequence());
1286        assert!(batch.timestamps_native().is_none());
1287    }
1288
1289    #[test]
1290    fn test_first_last_one() {
1291        let batch = new_batch(&[1], &[2], &[OpType::Put], &[4]);
1292        assert_eq!(
1293            Timestamp::new_millisecond(1),
1294            batch.first_timestamp().unwrap()
1295        );
1296        assert_eq!(
1297            Timestamp::new_millisecond(1),
1298            batch.last_timestamp().unwrap()
1299        );
1300        assert_eq!(2, batch.first_sequence().unwrap());
1301        assert_eq!(2, batch.last_sequence().unwrap());
1302    }
1303
1304    #[test]
1305    fn test_first_last_multiple() {
1306        let batch = new_batch(
1307            &[1, 2, 3],
1308            &[11, 12, 13],
1309            &[OpType::Put, OpType::Put, OpType::Put],
1310            &[21, 22, 23],
1311        );
1312        assert_eq!(
1313            Timestamp::new_millisecond(1),
1314            batch.first_timestamp().unwrap()
1315        );
1316        assert_eq!(
1317            Timestamp::new_millisecond(3),
1318            batch.last_timestamp().unwrap()
1319        );
1320        assert_eq!(11, batch.first_sequence().unwrap());
1321        assert_eq!(13, batch.last_sequence().unwrap());
1322    }
1323
1324    #[test]
1325    fn test_slice() {
1326        let batch = new_batch(
1327            &[1, 2, 3, 4],
1328            &[11, 12, 13, 14],
1329            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1330            &[21, 22, 23, 24],
1331        );
1332        let batch = batch.slice(1, 2);
1333        let expect = new_batch(
1334            &[2, 3],
1335            &[12, 13],
1336            &[OpType::Delete, OpType::Put],
1337            &[22, 23],
1338        );
1339        assert_eq!(expect, batch);
1340    }
1341
1342    #[test]
1343    fn test_timestamps_native() {
1344        let batch = new_batch(
1345            &[1, 2, 3, 4],
1346            &[11, 12, 13, 14],
1347            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1348            &[21, 22, 23, 24],
1349        );
1350        assert_eq!(&[1, 2, 3, 4], batch.timestamps_native().unwrap());
1351    }
1352
1353    #[test]
1354    fn test_concat_empty() {
1355        let err = Batch::concat(vec![]).unwrap_err();
1356        assert!(
1357            matches!(err, Error::InvalidBatch { .. }),
1358            "unexpected err: {err}"
1359        );
1360    }
1361
1362    #[test]
1363    fn test_concat_one() {
1364        let batch = new_batch(&[], &[], &[], &[]);
1365        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1366        assert_eq!(batch, actual);
1367
1368        let batch = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
1369        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1370        assert_eq!(batch, actual);
1371    }
1372
1373    #[test]
1374    fn test_concat_multiple() {
1375        let batches = vec![
1376            new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]),
1377            new_batch(
1378                &[3, 4, 5],
1379                &[13, 14, 15],
1380                &[OpType::Put, OpType::Delete, OpType::Put],
1381                &[23, 24, 25],
1382            ),
1383            new_batch(&[], &[], &[], &[]),
1384            new_batch(&[6], &[16], &[OpType::Put], &[26]),
1385        ];
1386        let batch = Batch::concat(batches).unwrap();
1387        let expect = new_batch(
1388            &[1, 2, 3, 4, 5, 6],
1389            &[11, 12, 13, 14, 15, 16],
1390            &[
1391                OpType::Put,
1392                OpType::Put,
1393                OpType::Put,
1394                OpType::Delete,
1395                OpType::Put,
1396                OpType::Put,
1397            ],
1398            &[21, 22, 23, 24, 25, 26],
1399        );
1400        assert_eq!(expect, batch);
1401    }
1402
1403    #[test]
1404    fn test_concat_different() {
1405        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1406        let mut batch2 = new_batch(&[2], &[2], &[OpType::Put], &[2]);
1407        batch2.primary_key = b"hello".to_vec();
1408        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1409        assert!(
1410            matches!(err, Error::InvalidBatch { .. }),
1411            "unexpected err: {err}"
1412        );
1413    }
1414
1415    #[test]
1416    fn test_concat_different_fields() {
1417        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
1418        let fields = vec![
1419            batch1.fields()[0].clone(),
1420            BatchColumn {
1421                column_id: 2,
1422                data: Arc::new(UInt64Vector::from_slice([2])),
1423            },
1424        ];
1425        // Batch 2 has more fields.
1426        let batch2 = batch1.clone().with_fields(fields).unwrap();
1427        let err = Batch::concat(vec![batch1.clone(), batch2]).unwrap_err();
1428        assert!(
1429            matches!(err, Error::InvalidBatch { .. }),
1430            "unexpected err: {err}"
1431        );
1432
1433        // Batch 2 has different field.
1434        let fields = vec![BatchColumn {
1435            column_id: 2,
1436            data: Arc::new(UInt64Vector::from_slice([2])),
1437        }];
1438        let batch2 = batch1.clone().with_fields(fields).unwrap();
1439        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
1440        assert!(
1441            matches!(err, Error::InvalidBatch { .. }),
1442            "unexpected err: {err}"
1443        );
1444    }
1445
1446    #[test]
1447    fn test_filter_deleted_empty() {
1448        let mut batch = new_batch(&[], &[], &[], &[]);
1449        batch.filter_deleted().unwrap();
1450        assert!(batch.is_empty());
1451    }
1452
1453    #[test]
1454    fn test_filter_deleted() {
1455        let mut batch = new_batch(
1456            &[1, 2, 3, 4],
1457            &[11, 12, 13, 14],
1458            &[OpType::Delete, OpType::Put, OpType::Delete, OpType::Put],
1459            &[21, 22, 23, 24],
1460        );
1461        batch.filter_deleted().unwrap();
1462        let expect = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
1463        assert_eq!(expect, batch);
1464
1465        let mut batch = new_batch(
1466            &[1, 2, 3, 4],
1467            &[11, 12, 13, 14],
1468            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1469            &[21, 22, 23, 24],
1470        );
1471        let expect = batch.clone();
1472        batch.filter_deleted().unwrap();
1473        assert_eq!(expect, batch);
1474    }
1475
    /// Exercises `filter_by_sequence` with `LtEq`, `Gt` and `GtLtEq` ranges,
    /// with `None`, and on empty batches.
    #[test]
    fn test_filter_by_sequence() {
        // `LtEq` on a put-only batch: rows with sequence <= 13 are kept.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[1, 2, 3],
            &[11, 12, 13],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23],
        );
        assert_eq!(expect, batch);

        // `LtEq` below every sequence: filters to empty.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );

        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // `None` filter: the batch is unchanged.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let expect = batch.clone();
        batch.filter_by_sequence(None).unwrap();
        assert_eq!(expect, batch);

        // Filter an empty batch.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // Filter an empty batch with `None`.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch.filter_by_sequence(None).unwrap();
        assert!(batch.is_empty());

        // `Gt` variant - exclusive lower bound: sequence 12 itself is dropped.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 12 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // `Gt` variant with no matches.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 20 }))
            .unwrap();
        assert!(batch.is_empty());

        // `GtLtEq` variant - exclusive lower bound, inclusive upper bound.
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 12, max: 14 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // `GtLtEq` variant with mixed operations: delete rows inside the range
        // are kept.
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Delete,
                OpType::Put,
                OpType::Delete,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 11, max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[2, 3],
            &[12, 13],
            &[OpType::Delete, OpType::Put],
            &[22, 23],
        );
        assert_eq!(expect, batch);

        // `GtLtEq` variant with no matches.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 20, max: 25 }))
            .unwrap();
        assert!(batch.is_empty());
    }
1612
1613    #[test]
1614    fn test_merge_last_non_null_no_dup() {
1615        let mut batch = new_batch_with_u64_fields(
1616            &[1, 2],
1617            &[2, 1],
1618            &[OpType::Put, OpType::Put],
1619            &[(1, &[Some(10), None]), (2, &[Some(100), Some(200)])],
1620        );
1621        let expect = batch.clone();
1622        batch.merge_last_non_null().unwrap();
1623        assert_eq!(expect, batch);
1624    }
1625
1626    #[test]
1627    fn test_merge_last_non_null_fill_null_fields() {
1628        // Rows are already sorted by timestamp asc and sequence desc.
1629        let mut batch = new_batch_with_u64_fields(
1630            &[1, 1, 1],
1631            &[3, 2, 1],
1632            &[OpType::Put, OpType::Put, OpType::Put],
1633            &[
1634                (1, &[None, Some(10), Some(11)]),
1635                (2, &[Some(100), Some(200), Some(300)]),
1636            ],
1637        );
1638        batch.merge_last_non_null().unwrap();
1639
1640        // Field 1 is filled from the first older row (seq=2). Field 2 keeps the base value.
1641        // Filled fields must not be overwritten by even older duplicates.
1642        let expect = new_batch_with_u64_fields(
1643            &[1],
1644            &[3],
1645            &[OpType::Put],
1646            &[(1, &[Some(10)]), (2, &[Some(100)])],
1647        );
1648        assert_eq!(expect, batch);
1649    }
1650
1651    #[test]
1652    fn test_merge_last_non_null_stop_at_delete_row() {
1653        // A delete row in older duplicates should stop filling to avoid resurrecting values before
1654        // deletion.
1655        let mut batch = new_batch_with_u64_fields(
1656            &[1, 1, 1],
1657            &[3, 2, 1],
1658            &[OpType::Put, OpType::Delete, OpType::Put],
1659            &[
1660                (1, &[None, Some(10), Some(11)]),
1661                (2, &[Some(100), Some(200), Some(300)]),
1662            ],
1663        );
1664        batch.merge_last_non_null().unwrap();
1665
1666        let expect = new_batch_with_u64_fields(
1667            &[1],
1668            &[3],
1669            &[OpType::Put],
1670            &[(1, &[None]), (2, &[Some(100)])],
1671        );
1672        assert_eq!(expect, batch);
1673    }
1674
1675    #[test]
1676    fn test_merge_last_non_null_base_delete_no_merge() {
1677        let mut batch = new_batch_with_u64_fields(
1678            &[1, 1],
1679            &[3, 2],
1680            &[OpType::Delete, OpType::Put],
1681            &[(1, &[None, Some(10)]), (2, &[None, Some(200)])],
1682        );
1683        batch.merge_last_non_null().unwrap();
1684
1685        // Base row is delete, keep it as is and don't merge fields from older rows.
1686        let expect =
1687            new_batch_with_u64_fields(&[1], &[3], &[OpType::Delete], &[(1, &[None]), (2, &[None])]);
1688        assert_eq!(expect, batch);
1689    }
1690
1691    #[test]
1692    fn test_merge_last_non_null_multiple_timestamp_groups() {
1693        let mut batch = new_batch_with_u64_fields(
1694            &[1, 1, 2, 3, 3],
1695            &[5, 4, 3, 2, 1],
1696            &[
1697                OpType::Put,
1698                OpType::Put,
1699                OpType::Put,
1700                OpType::Put,
1701                OpType::Put,
1702            ],
1703            &[
1704                (1, &[None, Some(10), Some(20), None, Some(30)]),
1705                (2, &[Some(100), Some(110), Some(120), None, Some(130)]),
1706            ],
1707        );
1708        batch.merge_last_non_null().unwrap();
1709
1710        let expect = new_batch_with_u64_fields(
1711            &[1, 2, 3],
1712            &[5, 3, 2],
1713            &[OpType::Put, OpType::Put, OpType::Put],
1714            &[
1715                (1, &[Some(10), Some(20), Some(30)]),
1716                (2, &[Some(100), Some(120), Some(130)]),
1717            ],
1718        );
1719        assert_eq!(expect, batch);
1720    }
1721
1722    #[test]
1723    fn test_merge_last_non_null_no_fields() {
1724        let mut batch = new_batch_without_fields(
1725            &[1, 1, 2],
1726            &[3, 2, 1],
1727            &[OpType::Put, OpType::Put, OpType::Put],
1728        );
1729        batch.merge_last_non_null().unwrap();
1730
1731        let expect = new_batch_without_fields(&[1, 2], &[3, 1], &[OpType::Put, OpType::Put]);
1732        assert_eq!(expect, batch);
1733    }
1734
1735    #[test]
1736    fn test_filter() {
1737        // Filters put only.
1738        let mut batch = new_batch(
1739            &[1, 2, 3, 4],
1740            &[11, 12, 13, 14],
1741            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
1742            &[21, 22, 23, 24],
1743        );
1744        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1745        batch.filter(&predicate).unwrap();
1746        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1747        assert_eq!(expect, batch);
1748
1749        // Filters deletion.
1750        let mut batch = new_batch(
1751            &[1, 2, 3, 4],
1752            &[11, 12, 13, 14],
1753            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1754            &[21, 22, 23, 24],
1755        );
1756        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
1757        batch.filter(&predicate).unwrap();
1758        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
1759        assert_eq!(expect, batch);
1760
1761        // Filters to empty.
1762        let predicate = BooleanVector::from_vec(vec![false, false]);
1763        batch.filter(&predicate).unwrap();
1764        assert!(batch.is_empty());
1765    }
1766
1767    #[test]
1768    fn test_sort_and_dedup() {
1769        let original = new_batch(
1770            &[2, 3, 1, 4, 5, 2],
1771            &[1, 2, 3, 4, 5, 6],
1772            &[
1773                OpType::Put,
1774                OpType::Put,
1775                OpType::Put,
1776                OpType::Put,
1777                OpType::Put,
1778                OpType::Put,
1779            ],
1780            &[21, 22, 23, 24, 25, 26],
1781        );
1782
1783        let mut batch = original.clone();
1784        batch.sort(true).unwrap();
1785        // It should only keep one timestamp 2.
1786        assert_eq!(
1787            new_batch(
1788                &[1, 2, 3, 4, 5],
1789                &[3, 6, 2, 4, 5],
1790                &[
1791                    OpType::Put,
1792                    OpType::Put,
1793                    OpType::Put,
1794                    OpType::Put,
1795                    OpType::Put,
1796                ],
1797                &[23, 26, 22, 24, 25],
1798            ),
1799            batch
1800        );
1801
1802        let mut batch = original.clone();
1803        batch.sort(false).unwrap();
1804
1805        // It should only keep one timestamp 2.
1806        assert_eq!(
1807            new_batch(
1808                &[1, 2, 2, 3, 4, 5],
1809                &[3, 6, 1, 2, 4, 5],
1810                &[
1811                    OpType::Put,
1812                    OpType::Put,
1813                    OpType::Put,
1814                    OpType::Put,
1815                    OpType::Put,
1816                    OpType::Put,
1817                ],
1818                &[23, 26, 21, 22, 24, 25],
1819            ),
1820            batch
1821        );
1822
1823        let original = new_batch(
1824            &[2, 2, 1],
1825            &[1, 6, 1],
1826            &[OpType::Delete, OpType::Put, OpType::Put],
1827            &[21, 22, 23],
1828        );
1829
1830        let mut batch = original.clone();
1831        batch.sort(true).unwrap();
1832        let expect = new_batch(&[1, 2], &[1, 6], &[OpType::Put, OpType::Put], &[23, 22]);
1833        assert_eq!(expect, batch);
1834
1835        let mut batch = original.clone();
1836        batch.sort(false).unwrap();
1837        let expect = new_batch(
1838            &[1, 2, 2],
1839            &[1, 6, 1],
1840            &[OpType::Put, OpType::Put, OpType::Delete],
1841            &[23, 22, 21],
1842        );
1843        assert_eq!(expect, batch);
1844    }
1845
1846    #[test]
1847    fn test_get_value() {
1848        let encodings = [PrimaryKeyEncoding::Dense, PrimaryKeyEncoding::Sparse];
1849
1850        for encoding in encodings {
1851            let codec = build_primary_key_codec_with_fields(
1852                encoding,
1853                [
1854                    (
1855                        ReservedColumnId::table_id(),
1856                        row_converter::SortField::new(ConcreteDataType::uint32_datatype()),
1857                    ),
1858                    (
1859                        ReservedColumnId::tsid(),
1860                        row_converter::SortField::new(ConcreteDataType::uint64_datatype()),
1861                    ),
1862                    (
1863                        100,
1864                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1865                    ),
1866                    (
1867                        200,
1868                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1869                    ),
1870                ]
1871                .into_iter(),
1872            );
1873
1874            let values = [
1875                Value::UInt32(1000),
1876                Value::UInt64(2000),
1877                Value::String("abcdefgh".into()),
1878                Value::String("zyxwvu".into()),
1879            ];
1880            let mut buf = vec![];
1881            codec
1882                .encode_values(
1883                    &[
1884                        (ReservedColumnId::table_id(), values[0].clone()),
1885                        (ReservedColumnId::tsid(), values[1].clone()),
1886                        (100, values[2].clone()),
1887                        (200, values[3].clone()),
1888                    ],
1889                    &mut buf,
1890                )
1891                .unwrap();
1892
1893            let field_col_id = 2;
1894            let mut batch = new_batch_builder(
1895                &buf,
1896                &[1, 2, 3],
1897                &[1, 1, 1],
1898                &[OpType::Put, OpType::Put, OpType::Put],
1899                field_col_id,
1900                &[42, 43, 44],
1901            )
1902            .build()
1903            .unwrap();
1904
1905            let v = batch
1906                .pk_col_value(&*codec, 0, ReservedColumnId::table_id())
1907                .unwrap()
1908                .unwrap();
1909            assert_eq!(values[0], *v);
1910
1911            let v = batch
1912                .pk_col_value(&*codec, 1, ReservedColumnId::tsid())
1913                .unwrap()
1914                .unwrap();
1915            assert_eq!(values[1], *v);
1916
1917            let v = batch.pk_col_value(&*codec, 2, 100).unwrap().unwrap();
1918            assert_eq!(values[2], *v);
1919
1920            let v = batch.pk_col_value(&*codec, 3, 200).unwrap().unwrap();
1921            assert_eq!(values[3], *v);
1922
1923            let v = batch.field_col_value(field_col_id).unwrap();
1924            assert_eq!(v.data.get(0), Value::UInt64(42));
1925            assert_eq!(v.data.get(1), Value::UInt64(43));
1926            assert_eq!(v.data.get(2), Value::UInt64(44));
1927        }
1928    }
1929}