mito2/
read.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Common structs and utilities for reading data.
16
17pub mod batch_adapter;
18pub mod compat;
19pub mod dedup;
20pub mod flat_dedup;
21pub mod flat_merge;
22pub mod flat_projection;
23pub mod last_row;
24pub mod merge;
25pub mod plain_batch;
26pub mod projection;
27pub(crate) mod prune;
28pub(crate) mod pruner;
29pub mod range;
30pub mod scan_region;
31pub mod scan_util;
32pub(crate) mod seq_scan;
33pub mod series_scan;
34pub mod stream;
35pub(crate) mod unordered_scan;
36
37use std::collections::{HashMap, HashSet};
38use std::sync::Arc;
39use std::time::Duration;
40
41use api::v1::OpType;
42use async_trait::async_trait;
43use common_time::Timestamp;
44use datafusion_common::arrow::array::UInt8Array;
45use datatypes::arrow;
46use datatypes::arrow::array::{Array, ArrayRef};
47use datatypes::arrow::compute::SortOptions;
48use datatypes::arrow::record_batch::RecordBatch;
49use datatypes::arrow::row::{RowConverter, SortField};
50use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
51use datatypes::scalars::ScalarVectorBuilder;
52use datatypes::types::TimestampType;
53use datatypes::value::{Value, ValueRef};
54use datatypes::vectors::{
55    BooleanVector, Helper, TimestampMicrosecondVector, TimestampMillisecondVector,
56    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampSecondVector,
57    UInt8Vector, UInt8VectorBuilder, UInt32Vector, UInt64Vector, UInt64VectorBuilder, Vector,
58    VectorRef,
59};
60use futures::TryStreamExt;
61use futures::stream::BoxStream;
62use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
63use snafu::{OptionExt, ResultExt, ensure};
64use store_api::metadata::RegionMetadata;
65use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
66
67use crate::error::{
68    ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu,
69    Result,
70};
71use crate::memtable::{BoxedBatchIterator, BoxedRecordBatchIterator};
72use crate::read::prune::PruneReader;
73
74/// Storage internal representation of a batch of rows for a primary key (time series).
75///
76/// Rows are sorted by primary key, timestamp, sequence desc, op_type desc. Fields
77/// always keep the same relative order as fields in [RegionMetadata](store_api::metadata::RegionMetadata).
#[derive(Debug, PartialEq, Clone)]
pub struct Batch {
    /// Primary key encoded in a comparable form.
    primary_key: Vec<u8>,
    /// Possibly decoded `primary_key` values. Some places would decode it in advance;
    /// otherwise it is decoded lazily by [Batch::pk_col_value].
    pk_values: Option<CompositeValues>,
    /// Timestamps of rows, should be sorted and not null.
    timestamps: VectorRef,
    /// Sequences of rows
    ///
    /// UInt64 type, not null.
    sequences: Arc<UInt64Vector>,
    /// Op types of rows
    ///
    /// UInt8 type, not null.
    op_types: Arc<UInt8Vector>,
    /// Fields organized in columnar format.
    fields: Vec<BatchColumn>,
    /// Cache for field index lookup, built lazily by [Batch::field_col_value].
    fields_idx: Option<HashMap<ColumnId, usize>>,
}
99
100impl Batch {
    /// Creates a new batch.
    ///
    /// # Errors
    /// Returns an error if the columns form an invalid batch; validation is
    /// delegated to [BatchBuilder].
    pub fn new(
        primary_key: Vec<u8>,
        timestamps: VectorRef,
        sequences: Arc<UInt64Vector>,
        op_types: Arc<UInt8Vector>,
        fields: Vec<BatchColumn>,
    ) -> Result<Batch> {
        BatchBuilder::with_required_columns(primary_key, timestamps, sequences, op_types)
            .with_fields(fields)
            .build()
    }

    /// Tries to set fields for the batch.
    ///
    /// Consumes `self` and rebuilds the batch through [Batch::new] so the new
    /// fields are validated against the existing key columns.
    pub fn with_fields(self, fields: Vec<BatchColumn>) -> Result<Batch> {
        Batch::new(
            self.primary_key,
            self.timestamps,
            self.sequences,
            self.op_types,
            fields,
        )
    }
124
    /// Returns primary key of the batch, encoded in a comparable form.
    pub fn primary_key(&self) -> &[u8] {
        &self.primary_key
    }

    /// Returns possibly decoded primary-key values, or `None` if they have not
    /// been decoded yet.
    pub fn pk_values(&self) -> Option<&CompositeValues> {
        self.pk_values.as_ref()
    }

    /// Sets possibly decoded primary-key values.
    ///
    /// The caller is responsible for keeping these consistent with `primary_key`.
    pub fn set_pk_values(&mut self, pk_values: CompositeValues) {
        self.pk_values = Some(pk_values);
    }

    /// Removes possibly decoded primary-key values. For testing only.
    #[cfg(any(test, feature = "test"))]
    pub fn remove_pk_values(&mut self) {
        self.pk_values = None;
    }

    /// Returns fields in the batch.
    pub fn fields(&self) -> &[BatchColumn] {
        &self.fields
    }

    /// Returns timestamps of the batch.
    pub fn timestamps(&self) -> &VectorRef {
        &self.timestamps
    }

    /// Returns sequences of the batch.
    pub fn sequences(&self) -> &Arc<UInt64Vector> {
        &self.sequences
    }

    /// Returns op types of the batch.
    pub fn op_types(&self) -> &Arc<UInt8Vector> {
        &self.op_types
    }
165
    /// Returns the number of rows in the batch.
    pub fn num_rows(&self) -> usize {
        // All vectors have the same length. We use the length of sequences vector
        // since it has static type.
        self.sequences.len()
    }

    /// Create an empty [`Batch`].
    ///
    /// The timestamp column defaults to millisecond precision; this is fine
    /// because the batch carries no rows.
    pub(crate) fn empty() -> Self {
        Self {
            primary_key: vec![],
            pk_values: None,
            timestamps: Arc::new(TimestampMillisecondVectorBuilder::with_capacity(0).finish()),
            sequences: Arc::new(UInt64VectorBuilder::with_capacity(0).finish()),
            op_types: Arc::new(UInt8VectorBuilder::with_capacity(0).finish()),
            fields: vec![],
            fields_idx: None,
        }
    }

    /// Returns true if the number of rows in the batch is 0.
    pub fn is_empty(&self) -> bool {
        self.num_rows() == 0
    }
190
191    /// Returns the first timestamp in the batch or `None` if the batch is empty.
192    pub fn first_timestamp(&self) -> Option<Timestamp> {
193        if self.timestamps.is_empty() {
194            return None;
195        }
196
197        Some(self.get_timestamp(0))
198    }
199
200    /// Returns the last timestamp in the batch or `None` if the batch is empty.
201    pub fn last_timestamp(&self) -> Option<Timestamp> {
202        if self.timestamps.is_empty() {
203            return None;
204        }
205
206        Some(self.get_timestamp(self.timestamps.len() - 1))
207    }
208
209    /// Returns the first sequence in the batch or `None` if the batch is empty.
210    pub fn first_sequence(&self) -> Option<SequenceNumber> {
211        if self.sequences.is_empty() {
212            return None;
213        }
214
215        Some(self.get_sequence(0))
216    }
217
218    /// Returns the last sequence in the batch or `None` if the batch is empty.
219    pub fn last_sequence(&self) -> Option<SequenceNumber> {
220        if self.sequences.is_empty() {
221            return None;
222        }
223
224        Some(self.get_sequence(self.sequences.len() - 1))
225    }
226
    /// Replaces the primary key of the batch.
    ///
    /// Notice that this [Batch] also contains a maybe-exist `pk_values`.
    /// Be sure to update that field as well (e.g. via [Batch::set_pk_values]),
    /// otherwise the cached decoded values will be inconsistent with the key.
    pub fn set_primary_key(&mut self, primary_key: Vec<u8>) {
        self.primary_key = primary_key;
    }
234
235    /// Slice the batch, returning a new batch.
236    ///
237    /// # Panics
238    /// Panics if `offset + length > self.num_rows()`.
239    pub fn slice(&self, offset: usize, length: usize) -> Batch {
240        let fields = self
241            .fields
242            .iter()
243            .map(|column| BatchColumn {
244                column_id: column.column_id,
245                data: column.data.slice(offset, length),
246            })
247            .collect();
248        // We skip using the builder to avoid validating the batch again.
249        Batch {
250            // Now we need to clone the primary key. We could try `Bytes` if
251            // this becomes a bottleneck.
252            primary_key: self.primary_key.clone(),
253            pk_values: self.pk_values.clone(),
254            timestamps: self.timestamps.slice(offset, length),
255            sequences: Arc::new(self.sequences.get_slice(offset, length)),
256            op_types: Arc::new(self.op_types.get_slice(offset, length)),
257            fields,
258            fields_idx: self.fields_idx.clone(),
259        }
260    }
261
    /// Takes `batches` and concat them into one batch.
    ///
    /// All `batches` must have the same primary key.
    ///
    /// # Errors
    /// Returns an error if `batches` is empty, or if the batches have
    /// different primary keys or different field layouts.
    pub fn concat(mut batches: Vec<Batch>) -> Result<Batch> {
        ensure!(
            !batches.is_empty(),
            InvalidBatchSnafu {
                reason: "empty batches",
            }
        );
        if batches.len() == 1 {
            // Now we own the `batches` so we could pop it directly.
            return Ok(batches.pop().unwrap());
        }

        let primary_key = std::mem::take(&mut batches[0].primary_key);
        let first = &batches[0];
        // We took the primary key from the first batch so we don't use `first.primary_key()`.
        ensure!(
            batches
                .iter()
                .skip(1)
                .all(|b| b.primary_key() == primary_key),
            InvalidBatchSnafu {
                reason: "batches have different primary key",
            }
        );
        // All batches must share the same number of fields and the same
        // column ids in the same order, so columns can be concatenated 1:1.
        for b in batches.iter().skip(1) {
            ensure!(
                b.fields.len() == first.fields.len(),
                InvalidBatchSnafu {
                    reason: "batches have different field num",
                }
            );
            for (l, r) in b.fields.iter().zip(&first.fields) {
                ensure!(
                    l.column_id == r.column_id,
                    InvalidBatchSnafu {
                        reason: "batches have different fields",
                    }
                );
            }
        }

        // We take the primary key from the first batch.
        let mut builder = BatchBuilder::new(primary_key);
        // Concat timestamps, sequences, op_types, fields.
        let array = concat_arrays(batches.iter().map(|b| b.timestamps().to_arrow_array()))?;
        builder.timestamps_array(array)?;
        let array = concat_arrays(batches.iter().map(|b| b.sequences().to_arrow_array()))?;
        builder.sequences_array(array)?;
        let array = concat_arrays(batches.iter().map(|b| b.op_types().to_arrow_array()))?;
        builder.op_types_array(array)?;
        for (i, batch_column) in first.fields.iter().enumerate() {
            let array = concat_arrays(batches.iter().map(|b| b.fields()[i].data.to_arrow_array()))?;
            builder.push_field_array(batch_column.column_id, array)?;
        }

        builder.build()
    }
322
323    /// Removes rows whose op type is delete.
324    pub fn filter_deleted(&mut self) -> Result<()> {
325        // Safety: op type column is not null.
326        let array = self.op_types.as_arrow();
327        // Find rows with non-delete op type.
328        let rhs = UInt8Array::new_scalar(OpType::Delete as u8);
329        let predicate =
330            arrow::compute::kernels::cmp::neq(array, &rhs).context(ComputeArrowSnafu)?;
331        self.filter(&BooleanVector::from(predicate))
332    }
333
    /// Applies the `predicate` to the batch.
    ///
    /// Filters timestamps, sequences, op types and all field columns by the
    /// same boolean mask, so the columns stay row-aligned.
    ///
    /// Safety: We know the array type so we unwrap on casting.
    pub fn filter(&mut self, predicate: &BooleanVector) -> Result<()> {
        self.timestamps = self
            .timestamps
            .filter(predicate)
            .context(ComputeVectorSnafu)?;
        self.sequences = Arc::new(
            UInt64Vector::try_from_arrow_array(
                arrow::compute::filter(self.sequences.as_arrow(), predicate.as_boolean_array())
                    .context(ComputeArrowSnafu)?,
            )
            .unwrap(),
        );
        self.op_types = Arc::new(
            UInt8Vector::try_from_arrow_array(
                arrow::compute::filter(self.op_types.as_arrow(), predicate.as_boolean_array())
                    .context(ComputeArrowSnafu)?,
            )
            .unwrap(),
        );
        for batch_column in &mut self.fields {
            batch_column.data = batch_column
                .data
                .filter(predicate)
                .context(ComputeVectorSnafu)?;
        }

        Ok(())
    }
364
    /// Filters rows by the given sequence range. Only preserves rows whose
    /// sequence falls inside `sequence`; `None` means no filtering.
    ///
    /// Fast paths: an empty batch, or a batch whose first/last sequences show
    /// it is already a subset of the range, is returned unchanged.
    pub fn filter_by_sequence(&mut self, sequence: Option<SequenceRange>) -> Result<()> {
        let seq_range = match sequence {
            None => return Ok(()),
            Some(seq_range) => {
                let (Some(first), Some(last)) = (self.first_sequence(), self.last_sequence())
                else {
                    return Ok(());
                };
                // NOTE(review): this subset check only inspects the first and
                // last sequences — it relies on the batch's sequence ordering
                // invariants; confirm against the sort invariant if changed.
                let is_subset = match seq_range {
                    SequenceRange::Gt { min } => min < first,
                    SequenceRange::LtEq { max } => max >= last,
                    SequenceRange::GtLtEq { min, max } => min < first && max >= last,
                };
                if is_subset {
                    return Ok(());
                }
                seq_range
            }
        };

        let seqs = self.sequences.as_arrow();
        let predicate = seq_range.filter(seqs).context(ComputeArrowSnafu)?;

        let predicate = BooleanVector::from(predicate);
        self.filter(&predicate)?;

        Ok(())
    }
394
    /// Sorts rows in the batch. If `dedup` is true, it also removes
    /// duplicated rows according to primary keys.
    ///
    /// It orders rows by timestamp, sequence desc and only keep the latest
    /// row for the same timestamp. It doesn't consider op type as sequence
    /// should already provide uniqueness for a row.
    pub fn sort(&mut self, dedup: bool) -> Result<()> {
        // If building a converter each time is costly, we may allow passing a
        // converter.
        let converter = RowConverter::new(vec![
            SortField::new(self.timestamps.data_type().as_arrow_type()),
            // Sequence is compared descending so the latest row of a
            // timestamp sorts first.
            SortField::new_with_options(
                self.sequences.data_type().as_arrow_type(),
                SortOptions {
                    descending: true,
                    ..Default::default()
                },
            ),
        ])
        .context(ComputeArrowSnafu)?;
        // Columns to sort.
        let columns = [
            self.timestamps.to_arrow_array(),
            self.sequences.to_arrow_array(),
        ];
        let rows = converter.convert_columns(&columns).unwrap();
        let mut to_sort: Vec<_> = rows.iter().enumerate().collect();

        // Skip the sort when rows are already in order.
        let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
        if !was_sorted {
            to_sort.sort_unstable_by_key(|x| x.1);
        }

        let num_rows = to_sort.len();
        if dedup {
            // Dedup by timestamps.
            to_sort.dedup_by(|left, right| {
                // 18 = total encoded row width; presumably two 9-byte sort keys
                // (timestamp then sequence, see TIMESTAMP_KEY_LEN) — confirm if
                // the row encoding changes.
                debug_assert_eq!(18, left.1.as_ref().len());
                debug_assert_eq!(18, right.1.as_ref().len());
                let (left_key, right_key) = (left.1.as_ref(), right.1.as_ref());
                // We only compare the timestamp part and ignore sequence.
                left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
            });
        }
        let no_dedup = to_sort.len() == num_rows;

        // Nothing moved and nothing removed: the batch is already correct.
        if was_sorted && no_dedup {
            return Ok(());
        }
        let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
        self.take_in_place(&indices)
    }
447
    /// Merges duplicated timestamps in the batch by keeping the latest non-null field values.
    ///
    /// Rows must already be sorted by timestamp (ascending) and sequence (descending).
    ///
    /// This method deduplicates rows with the same timestamp (keeping the first row in each
    /// timestamp range as the base row) and fills null fields from subsequent rows until all
    /// fields are filled or a delete operation is encountered.
    pub(crate) fn merge_last_non_null(&mut self) -> Result<()> {
        let num_rows = self.num_rows();
        if num_rows < 2 {
            return Ok(());
        }

        // Empty batches have no native timestamps; nothing to merge.
        let Some(timestamps) = self.timestamps_native() else {
            return Ok(());
        };

        // Fast path: check if there are any duplicate timestamps.
        let mut has_dup = false;
        let mut group_count = 1;
        for i in 1..num_rows {
            has_dup |= timestamps[i] == timestamps[i - 1];
            group_count += (timestamps[i] != timestamps[i - 1]) as usize;
        }
        if !has_dup {
            return Ok(());
        }

        let num_fields = self.fields.len();
        let op_types = self.op_types.as_arrow().values();

        // For every timestamp group: one base row index for the key columns,
        // and one row index per field column (may differ from the base when a
        // null field is filled from an older row).
        let mut base_indices: Vec<u32> = Vec::with_capacity(group_count);
        let mut field_indices: Vec<Vec<u32>> = (0..num_fields)
            .map(|_| Vec::with_capacity(group_count))
            .collect();

        let mut start = 0;
        while start < num_rows {
            // [start, end) is one group of rows sharing the same timestamp.
            let ts = timestamps[start];
            let mut end = start + 1;
            while end < num_rows && timestamps[end] == ts {
                end += 1;
            }

            let group_pos = base_indices.len();
            base_indices.push(start as u32);

            if num_fields > 0 {
                // Default: take the base row for all fields.
                for idx in &mut field_indices {
                    idx.push(start as u32);
                }

                let base_deleted = op_types[start] == OpType::Delete as u8;
                if !base_deleted {
                    // Track fields that are null in the base row and try to fill them from older
                    // rows in the same timestamp range.
                    let mut missing_fields = Vec::new();
                    for (field_idx, col) in self.fields.iter().enumerate() {
                        if col.data.is_null(start) {
                            missing_fields.push(field_idx);
                        }
                    }

                    if !missing_fields.is_empty() {
                        // Rows after `start` are older (sequence descending);
                        // stop at the first delete so deleted values never leak.
                        for row_idx in (start + 1)..end {
                            if op_types[row_idx] == OpType::Delete as u8 {
                                break;
                            }

                            missing_fields.retain(|&field_idx| {
                                if self.fields[field_idx].data.is_null(row_idx) {
                                    true
                                } else {
                                    field_indices[field_idx][group_pos] = row_idx as u32;
                                    false
                                }
                            });

                            if missing_fields.is_empty() {
                                break;
                            }
                        }
                    }
                }
            }

            start = end;
        }

        // Materialize the merged batch by taking the chosen indices.
        let base_indices = UInt32Vector::from_vec(base_indices);
        self.timestamps = self
            .timestamps
            .take(&base_indices)
            .context(ComputeVectorSnafu)?;
        let array = arrow::compute::take(self.sequences.as_arrow(), base_indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: We know the array and vector type.
        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
        let array = arrow::compute::take(self.op_types.as_arrow(), base_indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: We know the array and vector type.
        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());

        for (field_idx, batch_column) in self.fields.iter_mut().enumerate() {
            let idx = UInt32Vector::from_vec(std::mem::take(&mut field_indices[field_idx]));
            batch_column.data = batch_column.data.take(&idx).context(ComputeVectorSnafu)?;
        }

        Ok(())
    }
559
560    /// Returns the estimated memory size of the batch.
561    pub fn memory_size(&self) -> usize {
562        let mut size = std::mem::size_of::<Self>();
563        size += self.primary_key.len();
564        size += self.timestamps.memory_size();
565        size += self.sequences.memory_size();
566        size += self.op_types.memory_size();
567        for batch_column in &self.fields {
568            size += batch_column.data.memory_size();
569        }
570        size
571    }
572
573    /// Returns ids and datatypes of fields in the [Batch] after applying the `projection`.
574    pub(crate) fn projected_fields(
575        metadata: &RegionMetadata,
576        projection: &[ColumnId],
577    ) -> Vec<(ColumnId, ConcreteDataType)> {
578        let projected_ids: HashSet<_> = projection.iter().copied().collect();
579        metadata
580            .field_columns()
581            .filter_map(|column| {
582                if projected_ids.contains(&column.column_id) {
583                    Some((column.column_id, column.column_schema.data_type.clone()))
584                } else {
585                    None
586                }
587            })
588            .collect()
589    }
590
    /// Returns timestamps in a native slice or `None` if the batch is empty.
    ///
    /// The returned `i64` values are in the unit of the timestamp column's type
    /// (second/millisecond/microsecond/nanosecond).
    ///
    /// # Panics
    /// Panics if the timestamp column is not a timestamp type, which a valid
    /// [Batch] never is.
    pub(crate) fn timestamps_native(&self) -> Option<&[i64]> {
        if self.timestamps.is_empty() {
            return None;
        }

        // Downcast to the concrete vector for the precision, then expose the
        // underlying arrow values buffer.
        let values = match self.timestamps.data_type() {
            ConcreteDataType::Timestamp(TimestampType::Second(_)) => self
                .timestamps
                .as_any()
                .downcast_ref::<TimestampSecondVector>()
                .unwrap()
                .as_arrow()
                .values(),
            ConcreteDataType::Timestamp(TimestampType::Millisecond(_)) => self
                .timestamps
                .as_any()
                .downcast_ref::<TimestampMillisecondVector>()
                .unwrap()
                .as_arrow()
                .values(),
            ConcreteDataType::Timestamp(TimestampType::Microsecond(_)) => self
                .timestamps
                .as_any()
                .downcast_ref::<TimestampMicrosecondVector>()
                .unwrap()
                .as_arrow()
                .values(),
            ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)) => self
                .timestamps
                .as_any()
                .downcast_ref::<TimestampNanosecondVector>()
                .unwrap()
                .as_arrow()
                .values(),
            other => panic!("timestamps in a Batch has other type {:?}", other),
        };

        Some(values)
    }
631
    /// Takes the batch in place.
    ///
    /// Reorders (or filters) all columns by `indices`, keeping every column
    /// row-aligned with the others.
    fn take_in_place(&mut self, indices: &UInt32Vector) -> Result<()> {
        self.timestamps = self.timestamps.take(indices).context(ComputeVectorSnafu)?;
        let array = arrow::compute::take(self.sequences.as_arrow(), indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: we know the array and vector type.
        self.sequences = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
        let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
            .context(ComputeArrowSnafu)?;
        // Safety: we know the array and vector type.
        self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
        for batch_column in &mut self.fields {
            batch_column.data = batch_column
                .data
                .take(indices)
                .context(ComputeVectorSnafu)?;
        }

        Ok(())
    }
651
    /// Gets a timestamp at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the timestamp vector returns null.
    fn get_timestamp(&self, index: usize) -> Timestamp {
        match self.timestamps.get_ref(index) {
            ValueRef::Timestamp(timestamp) => timestamp,

            // We have checked the data type is timestamp compatible in the [BatchBuilder] so it's safe to panic.
            value => panic!("{:?} is not a timestamp", value),
        }
    }

    /// Gets a sequence at given `index`.
    ///
    /// # Panics
    /// Panics if `index` is out-of-bound or the sequence vector returns null.
    pub(crate) fn get_sequence(&self, index: usize) -> SequenceNumber {
        // Safety: sequences is not null so it actually returns Some.
        self.sequences.get_data(index).unwrap()
    }
673
    /// Checks the batch is monotonic by timestamps.
    ///
    /// Timestamps must be non-decreasing, and for equal timestamps, sequences
    /// must be non-increasing. Returns a human-readable error message at the
    /// first violation. Debug builds only.
    #[cfg(debug_assertions)]
    pub(crate) fn check_monotonic(&self) -> Result<(), String> {
        use std::cmp::Ordering;
        // An empty batch is trivially monotonic.
        if self.timestamps_native().is_none() {
            return Ok(());
        }

        let timestamps = self.timestamps_native().unwrap();
        let sequences = self.sequences.as_arrow().values();
        // Compare every adjacent pair of rows.
        for (i, window) in timestamps.windows(2).enumerate() {
            let current = window[0];
            let next = window[1];
            let current_sequence = sequences[i];
            let next_sequence = sequences[i + 1];
            match current.cmp(&next) {
                Ordering::Less => {
                    // The current timestamp is less than the next timestamp.
                    continue;
                }
                Ordering::Equal => {
                    // The current timestamp is equal to the next timestamp.
                    if current_sequence < next_sequence {
                        return Err(format!(
                            "sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
                            current, next, current_sequence, next_sequence, i
                        ));
                    }
                }
                Ordering::Greater => {
                    // The current timestamp is greater than the next timestamp.
                    return Err(format!(
                        "timestamps are not monotonic: {} > {}, index: {}",
                        current, next, i
                    ));
                }
            }
        }

        Ok(())
    }
715
    /// Returns Ok if the given batch is behind the current batch.
    ///
    /// "Behind" means the stream-level ordering holds: primary key ascending,
    /// then timestamp ascending, then sequence descending. Debug builds only.
    #[cfg(debug_assertions)]
    pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
        // Checks the primary key
        if self.primary_key() < other.primary_key() {
            return Ok(());
        }
        if self.primary_key() > other.primary_key() {
            return Err(format!(
                "primary key is not monotonic: {:?} > {:?}",
                self.primary_key(),
                other.primary_key()
            ));
        }
        // Checks the timestamp.
        if self.last_timestamp() < other.first_timestamp() {
            return Ok(());
        }
        if self.last_timestamp() > other.first_timestamp() {
            return Err(format!(
                "timestamps are not monotonic: {:?} > {:?}",
                self.last_timestamp(),
                other.first_timestamp()
            ));
        }
        // Checks the sequence.
        if self.last_sequence() >= other.first_sequence() {
            return Ok(());
        }
        Err(format!(
            "sequences are not monotonic: {:?} < {:?}",
            self.last_sequence(),
            other.first_sequence()
        ))
    }
751
    /// Returns the value of the column in the primary key.
    ///
    /// Lazily decodes the primary key and caches the result.
    /// For densely encoded keys the value is looked up by position
    /// (`col_idx_in_pk`); for sparsely encoded keys it is looked up by
    /// `column_id`.
    ///
    /// # Errors
    /// Returns an error if decoding the primary key fails.
    pub fn pk_col_value(
        &mut self,
        codec: &dyn PrimaryKeyCodec,
        col_idx_in_pk: usize,
        column_id: ColumnId,
    ) -> Result<Option<&Value>> {
        if self.pk_values.is_none() {
            self.pk_values = Some(codec.decode(&self.primary_key).context(DecodeSnafu)?);
        }

        let pk_values = self.pk_values.as_ref().unwrap();
        Ok(match pk_values {
            CompositeValues::Dense(values) => values.get(col_idx_in_pk).map(|(_, v)| v),
            CompositeValues::Sparse(values) => values.get(&column_id),
        })
    }
771
772    /// Returns values of the field in the batch.
773    ///
774    /// Lazily caches the field index.
775    pub fn field_col_value(&mut self, column_id: ColumnId) -> Option<&BatchColumn> {
776        if self.fields_idx.is_none() {
777            self.fields_idx = Some(
778                self.fields
779                    .iter()
780                    .enumerate()
781                    .map(|(i, c)| (c.column_id, i))
782                    .collect(),
783            );
784        }
785
786        self.fields_idx
787            .as_ref()
788            .unwrap()
789            .get(&column_id)
790            .map(|&idx| &self.fields[idx])
791    }
792}
793
/// A struct to check the batch is monotonic.
#[cfg(debug_assertions)]
#[derive(Default)]
pub(crate) struct BatchChecker {
    // The most recently checked batch, used to validate ordering across batches.
    last_batch: Option<Batch>,
    // Optional inclusive lower bound for batch timestamps.
    start: Option<Timestamp>,
    // Optional exclusive upper bound for batch timestamps.
    end: Option<Timestamp>,
}
802
803#[cfg(debug_assertions)]
804impl BatchChecker {
    /// Attaches the given start timestamp to the checker.
    ///
    /// Batches with a first timestamp before `start` will fail the check.
    pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
        self.start = start;
        self
    }

    /// Attaches the given end timestamp to the checker.
    ///
    /// Batches with a last timestamp at or after `end` will fail the check.
    pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
        self.end = end;
        self
    }
816
    /// Returns `Ok(())` if the given batch is monotonic and behind
    /// the last batch, and within the optional `start` (inclusive) and
    /// `end` (exclusive) timestamp bounds.
    pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
        batch.check_monotonic()?;

        if let (Some(start), Some(first)) = (self.start, batch.first_timestamp())
            && start > first
        {
            return Err(format!(
                "batch's first timestamp is before the start timestamp: {:?} > {:?}",
                start, first
            ));
        }
        if let (Some(end), Some(last)) = (self.end, batch.last_timestamp())
            && end <= last
        {
            return Err(format!(
                "batch's last timestamp is after the end timestamp: {:?} <= {:?}",
                end, last
            ));
        }

        // Checks the batch is behind the last batch.
        // Then Updates the last batch.
        let res = self
            .last_batch
            .as_ref()
            .map(|last| last.check_next_batch(batch))
            .unwrap_or(Ok(()));
        self.last_batch = Some(batch.clone());
        res
    }
849
850    /// Formats current batch and last batch for debug.
851    pub(crate) fn format_batch(&self, batch: &Batch) -> String {
852        use std::fmt::Write;
853
854        let mut message = String::new();
855        if let Some(last) = &self.last_batch {
856            write!(
857                message,
858                "last_pk: {:?}, last_ts: {:?}, last_seq: {:?}, ",
859                last.primary_key(),
860                last.last_timestamp(),
861                last.last_sequence()
862            )
863            .unwrap();
864        }
865        write!(
866            message,
867            "batch_pk: {:?}, batch_ts: {:?}, batch_seq: {:?}",
868            batch.primary_key(),
869            batch.timestamps(),
870            batch.sequences()
871        )
872        .unwrap();
873
874        message
875    }
876
877    /// Checks batches from the part range are monotonic. Otherwise, panics.
878    pub(crate) fn ensure_part_range_batch(
879        &mut self,
880        scanner: &str,
881        region_id: store_api::storage::RegionId,
882        partition: usize,
883        part_range: store_api::region_engine::PartitionRange,
884        batch: &Batch,
885    ) {
886        if let Err(e) = self.check_monotonic(batch) {
887            let err_msg = format!(
888                "{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
889                scanner, e, region_id, partition, part_range,
890            );
891            common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
892            // Only print the number of row in the panic message.
893            panic!("{err_msg}, batch rows: {}", batch.num_rows());
894        }
895    }
896}
897
/// Len of timestamp in arrow row format.
///
/// NOTE(review): 9 presumably corresponds to 1 validity byte plus the 8-byte
/// timestamp value in arrow's row encoding — confirm against `arrow::row` docs.
const TIMESTAMP_KEY_LEN: usize = 9;
900
901/// Helper function to concat arrays from `iter`.
902fn concat_arrays(iter: impl Iterator<Item = ArrayRef>) -> Result<ArrayRef> {
903    let arrays: Vec<_> = iter.collect();
904    let dyn_arrays: Vec<_> = arrays.iter().map(|array| array.as_ref()).collect();
905    arrow::compute::concat(&dyn_arrays).context(ComputeArrowSnafu)
906}
907
/// A column in a [Batch].
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct BatchColumn {
    /// Id of the column.
    pub column_id: ColumnId,
    /// Data of the column.
    ///
    /// Must have the same length as the batch's timestamps
    /// (enforced by [BatchBuilder::build]).
    pub data: VectorRef,
}
916
/// Builder to build [Batch].
pub struct BatchBuilder {
    /// Encoded primary key of the batch.
    primary_key: Vec<u8>,
    /// Timestamp column; must be set before `build()`.
    timestamps: Option<VectorRef>,
    /// Sequence column; must be set before `build()`.
    sequences: Option<Arc<UInt64Vector>>,
    /// Op type column; must be set before `build()`.
    op_types: Option<Arc<UInt8Vector>>,
    /// Field columns (may be empty).
    fields: Vec<BatchColumn>,
}
925
926impl BatchBuilder {
927    /// Creates a new [BatchBuilder] with primary key.
928    pub fn new(primary_key: Vec<u8>) -> BatchBuilder {
929        BatchBuilder {
930            primary_key,
931            timestamps: None,
932            sequences: None,
933            op_types: None,
934            fields: Vec::new(),
935        }
936    }
937
938    /// Creates a new [BatchBuilder] with all required columns.
939    pub fn with_required_columns(
940        primary_key: Vec<u8>,
941        timestamps: VectorRef,
942        sequences: Arc<UInt64Vector>,
943        op_types: Arc<UInt8Vector>,
944    ) -> BatchBuilder {
945        BatchBuilder {
946            primary_key,
947            timestamps: Some(timestamps),
948            sequences: Some(sequences),
949            op_types: Some(op_types),
950            fields: Vec::new(),
951        }
952    }
953
954    /// Set all field columns.
955    pub fn with_fields(mut self, fields: Vec<BatchColumn>) -> Self {
956        self.fields = fields;
957        self
958    }
959
960    /// Push a field column.
961    pub fn push_field(&mut self, column: BatchColumn) -> &mut Self {
962        self.fields.push(column);
963        self
964    }
965
966    /// Push an array as a field.
967    pub fn push_field_array(&mut self, column_id: ColumnId, array: ArrayRef) -> Result<&mut Self> {
968        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
969        self.fields.push(BatchColumn {
970            column_id,
971            data: vector,
972        });
973
974        Ok(self)
975    }
976
977    /// Try to set an array as timestamps.
978    pub fn timestamps_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
979        let vector = Helper::try_into_vector(array).context(ConvertVectorSnafu)?;
980        ensure!(
981            vector.data_type().is_timestamp(),
982            InvalidBatchSnafu {
983                reason: format!("{:?} is not a timestamp type", vector.data_type()),
984            }
985        );
986
987        self.timestamps = Some(vector);
988        Ok(self)
989    }
990
991    /// Try to set an array as sequences.
992    pub fn sequences_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
993        ensure!(
994            *array.data_type() == arrow::datatypes::DataType::UInt64,
995            InvalidBatchSnafu {
996                reason: "sequence array is not UInt64 type",
997            }
998        );
999        // Safety: The cast must success as we have ensured it is uint64 type.
1000        let vector = Arc::new(UInt64Vector::try_from_arrow_array(array).unwrap());
1001        self.sequences = Some(vector);
1002
1003        Ok(self)
1004    }
1005
1006    /// Try to set an array as op types.
1007    pub fn op_types_array(&mut self, array: ArrayRef) -> Result<&mut Self> {
1008        ensure!(
1009            *array.data_type() == arrow::datatypes::DataType::UInt8,
1010            InvalidBatchSnafu {
1011                reason: "sequence array is not UInt8 type",
1012            }
1013        );
1014        // Safety: The cast must success as we have ensured it is uint64 type.
1015        let vector = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
1016        self.op_types = Some(vector);
1017
1018        Ok(self)
1019    }
1020
1021    /// Builds the [Batch].
1022    pub fn build(self) -> Result<Batch> {
1023        let timestamps = self.timestamps.context(InvalidBatchSnafu {
1024            reason: "missing timestamps",
1025        })?;
1026        let sequences = self.sequences.context(InvalidBatchSnafu {
1027            reason: "missing sequences",
1028        })?;
1029        let op_types = self.op_types.context(InvalidBatchSnafu {
1030            reason: "missing op_types",
1031        })?;
1032        // Our storage format ensure these columns are not nullable so
1033        // we use assert here.
1034        assert_eq!(0, timestamps.null_count());
1035        assert_eq!(0, sequences.null_count());
1036        assert_eq!(0, op_types.null_count());
1037
1038        let ts_len = timestamps.len();
1039        ensure!(
1040            sequences.len() == ts_len,
1041            InvalidBatchSnafu {
1042                reason: format!(
1043                    "sequence have different len {} != {}",
1044                    sequences.len(),
1045                    ts_len
1046                ),
1047            }
1048        );
1049        ensure!(
1050            op_types.len() == ts_len,
1051            InvalidBatchSnafu {
1052                reason: format!(
1053                    "op type have different len {} != {}",
1054                    op_types.len(),
1055                    ts_len
1056                ),
1057            }
1058        );
1059        for column in &self.fields {
1060            ensure!(
1061                column.data.len() == ts_len,
1062                InvalidBatchSnafu {
1063                    reason: format!(
1064                        "column {} has different len {} != {}",
1065                        column.column_id,
1066                        column.data.len(),
1067                        ts_len
1068                    ),
1069                }
1070            );
1071        }
1072
1073        Ok(Batch {
1074            primary_key: self.primary_key,
1075            pk_values: None,
1076            timestamps,
1077            sequences,
1078            op_types,
1079            fields: self.fields,
1080            fields_idx: None,
1081        })
1082    }
1083}
1084
1085impl From<Batch> for BatchBuilder {
1086    fn from(batch: Batch) -> Self {
1087        Self {
1088            primary_key: batch.primary_key,
1089            timestamps: Some(batch.timestamps),
1090            sequences: Some(batch.sequences),
1091            op_types: Some(batch.op_types),
1092            fields: batch.fields,
1093        }
1094    }
1095}
1096
/// Async [Batch] reader and iterator wrapper.
///
/// This is the data source for SST writers or internal readers.
/// Every variant is consumed through [Source::next_batch].
pub enum Source {
    /// Source from a [BoxedBatchReader].
    Reader(BoxedBatchReader),
    /// Source from a [BoxedBatchIterator].
    Iter(BoxedBatchIterator),
    /// Source from a [BoxedBatchStream].
    Stream(BoxedBatchStream),
    /// Source from a [PruneReader].
    PruneReader(PruneReader),
}
1110
1111impl Source {
1112    /// Returns next [Batch] from this data source.
1113    pub async fn next_batch(&mut self) -> Result<Option<Batch>> {
1114        match self {
1115            Source::Reader(reader) => reader.next_batch().await,
1116            Source::Iter(iter) => iter.next().transpose(),
1117            Source::Stream(stream) => stream.try_next().await,
1118            Source::PruneReader(reader) => reader.next_batch().await,
1119        }
1120    }
1121}
1122
/// Async [RecordBatch] reader and iterator wrapper for flat format.
///
/// Counterpart of [Source] that yields arrow [RecordBatch]es instead of [Batch]es.
pub enum FlatSource {
    /// Source from a [BoxedRecordBatchIterator].
    Iter(BoxedRecordBatchIterator),
    /// Source from a [BoxedRecordBatchStream].
    Stream(BoxedRecordBatchStream),
}
1130
1131impl FlatSource {
1132    /// Returns next [RecordBatch] from this data source.
1133    pub async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
1134        match self {
1135            FlatSource::Iter(iter) => iter.next().transpose(),
1136            FlatSource::Stream(stream) => stream.try_next().await,
1137        }
1138    }
1139}
1140
/// Async batch reader.
///
/// The reader must guarantee [Batch]es returned by it have the same schema.
/// Readers are `Send` so they can be moved across threads or tasks.
#[async_trait]
pub trait BatchReader: Send {
    /// Fetch next [Batch].
    ///
    /// Returns `Ok(None)` when the reader has reached its end and calling `next_batch()`
    /// again won't return batch again.
    ///
    /// If `Err` is returned, caller should not call this method again, the implementor
    /// may or may not panic in such case.
    async fn next_batch(&mut self) -> Result<Option<Batch>>;
}
1155
/// Pointer to [BatchReader].
///
/// Boxed so heterogeneous reader implementations can be stored uniformly.
pub type BoxedBatchReader = Box<dyn BatchReader>;

/// Pointer to a stream that yields [Batch].
pub type BoxedBatchStream = BoxStream<'static, Result<Batch>>;

/// Pointer to a stream that yields [RecordBatch].
pub type BoxedRecordBatchStream = BoxStream<'static, Result<RecordBatch>>;
1164
1165#[async_trait::async_trait]
1166impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
1167    async fn next_batch(&mut self) -> Result<Option<Batch>> {
1168        (**self).next_batch().await
1169    }
1170}
1171
/// Local metrics for scanners.
///
/// Tracks the time spent scanning and yielding as well as the number of
/// batches and rows produced.
#[derive(Debug, Default)]
pub(crate) struct ScannerMetrics {
    /// Duration to scan data.
    scan_cost: Duration,
    /// Duration while waiting for `yield`.
    yield_cost: Duration,
    /// Number of batches returned.
    num_batches: usize,
    /// Number of rows returned.
    num_rows: usize,
}
1184
1185#[cfg(test)]
1186mod tests {
1187    use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
1188    use mito_codec::row_converter::{self, build_primary_key_codec_with_fields};
1189    use store_api::codec::PrimaryKeyEncoding;
1190    use store_api::storage::consts::ReservedColumnId;
1191
1192    use super::*;
1193    use crate::error::Error;
1194    use crate::test_util::new_batch_builder;
1195
1196    fn new_batch(
1197        timestamps: &[i64],
1198        sequences: &[u64],
1199        op_types: &[OpType],
1200        field: &[u64],
1201    ) -> Batch {
1202        new_batch_builder(b"test", timestamps, sequences, op_types, 1, field)
1203            .build()
1204            .unwrap()
1205    }
1206
    /// Builds a batch keyed by `b"test"` from parallel slices, where each
    /// field is a `(column_id, values)` pair of nullable u64 values.
    fn new_batch_with_u64_fields(
        timestamps: &[i64],
        sequences: &[u64],
        op_types: &[OpType],
        fields: &[(ColumnId, &[Option<u64>])],
    ) -> Batch {
        // All columns must have one value per row.
        assert_eq!(timestamps.len(), sequences.len());
        assert_eq!(timestamps.len(), op_types.len());
        for (_, values) in fields {
            assert_eq!(timestamps.len(), values.len());
        }

        let mut builder = BatchBuilder::new(b"test".to_vec());
        builder
            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
                timestamps.iter().copied(),
            )))
            .unwrap()
            .sequences_array(Arc::new(UInt64Array::from_iter_values(
                sequences.iter().copied(),
            )))
            .unwrap()
            .op_types_array(Arc::new(UInt8Array::from_iter_values(
                op_types.iter().map(|v| *v as u8),
            )))
            .unwrap();

        for (col_id, values) in fields {
            builder
                .push_field_array(*col_id, Arc::new(UInt64Array::from(values.to_vec())))
                .unwrap();
        }

        builder.build().unwrap()
    }
1242
    /// Builds a batch keyed by `b"test"` with no field columns at all.
    fn new_batch_without_fields(
        timestamps: &[i64],
        sequences: &[u64],
        op_types: &[OpType],
    ) -> Batch {
        // All columns must have one value per row.
        assert_eq!(timestamps.len(), sequences.len());
        assert_eq!(timestamps.len(), op_types.len());

        let mut builder = BatchBuilder::new(b"test".to_vec());
        builder
            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
                timestamps.iter().copied(),
            )))
            .unwrap()
            .sequences_array(Arc::new(UInt64Array::from_iter_values(
                sequences.iter().copied(),
            )))
            .unwrap()
            .op_types_array(Arc::new(UInt8Array::from_iter_values(
                op_types.iter().map(|v| *v as u8),
            )))
            .unwrap();

        builder.build().unwrap()
    }
1268
1269    #[test]
1270    fn test_empty_batch() {
1271        let batch = Batch::empty();
1272        assert!(batch.is_empty());
1273        assert_eq!(None, batch.first_timestamp());
1274        assert_eq!(None, batch.last_timestamp());
1275        assert_eq!(None, batch.first_sequence());
1276        assert_eq!(None, batch.last_sequence());
1277        assert!(batch.timestamps_native().is_none());
1278    }
1279
1280    #[test]
1281    fn test_first_last_one() {
1282        let batch = new_batch(&[1], &[2], &[OpType::Put], &[4]);
1283        assert_eq!(
1284            Timestamp::new_millisecond(1),
1285            batch.first_timestamp().unwrap()
1286        );
1287        assert_eq!(
1288            Timestamp::new_millisecond(1),
1289            batch.last_timestamp().unwrap()
1290        );
1291        assert_eq!(2, batch.first_sequence().unwrap());
1292        assert_eq!(2, batch.last_sequence().unwrap());
1293    }
1294
1295    #[test]
1296    fn test_first_last_multiple() {
1297        let batch = new_batch(
1298            &[1, 2, 3],
1299            &[11, 12, 13],
1300            &[OpType::Put, OpType::Put, OpType::Put],
1301            &[21, 22, 23],
1302        );
1303        assert_eq!(
1304            Timestamp::new_millisecond(1),
1305            batch.first_timestamp().unwrap()
1306        );
1307        assert_eq!(
1308            Timestamp::new_millisecond(3),
1309            batch.last_timestamp().unwrap()
1310        );
1311        assert_eq!(11, batch.first_sequence().unwrap());
1312        assert_eq!(13, batch.last_sequence().unwrap());
1313    }
1314
1315    #[test]
1316    fn test_slice() {
1317        let batch = new_batch(
1318            &[1, 2, 3, 4],
1319            &[11, 12, 13, 14],
1320            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1321            &[21, 22, 23, 24],
1322        );
1323        let batch = batch.slice(1, 2);
1324        let expect = new_batch(
1325            &[2, 3],
1326            &[12, 13],
1327            &[OpType::Delete, OpType::Put],
1328            &[22, 23],
1329        );
1330        assert_eq!(expect, batch);
1331    }
1332
1333    #[test]
1334    fn test_timestamps_native() {
1335        let batch = new_batch(
1336            &[1, 2, 3, 4],
1337            &[11, 12, 13, 14],
1338            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
1339            &[21, 22, 23, 24],
1340        );
1341        assert_eq!(&[1, 2, 3, 4], batch.timestamps_native().unwrap());
1342    }
1343
1344    #[test]
1345    fn test_concat_empty() {
1346        let err = Batch::concat(vec![]).unwrap_err();
1347        assert!(
1348            matches!(err, Error::InvalidBatch { .. }),
1349            "unexpected err: {err}"
1350        );
1351    }
1352
1353    #[test]
1354    fn test_concat_one() {
1355        let batch = new_batch(&[], &[], &[], &[]);
1356        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1357        assert_eq!(batch, actual);
1358
1359        let batch = new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]);
1360        let actual = Batch::concat(vec![batch.clone()]).unwrap();
1361        assert_eq!(batch, actual);
1362    }
1363
    #[test]
    fn test_concat_multiple() {
        // Concatenation preserves row order; the empty batch contributes nothing.
        let batches = vec![
            new_batch(&[1, 2], &[11, 12], &[OpType::Put, OpType::Put], &[21, 22]),
            new_batch(
                &[3, 4, 5],
                &[13, 14, 15],
                &[OpType::Put, OpType::Delete, OpType::Put],
                &[23, 24, 25],
            ),
            new_batch(&[], &[], &[], &[]),
            new_batch(&[6], &[16], &[OpType::Put], &[26]),
        ];
        let batch = Batch::concat(batches).unwrap();
        let expect = new_batch(
            &[1, 2, 3, 4, 5, 6],
            &[11, 12, 13, 14, 15, 16],
            &[
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Delete,
                OpType::Put,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25, 26],
        );
        assert_eq!(expect, batch);
    }
1393
    #[test]
    fn test_concat_different() {
        // Batches with different primary keys cannot be concatenated.
        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
        let mut batch2 = new_batch(&[2], &[2], &[OpType::Put], &[2]);
        batch2.primary_key = b"hello".to_vec();
        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
        assert!(
            matches!(err, Error::InvalidBatch { .. }),
            "unexpected err: {err}"
        );
    }
1405
    #[test]
    fn test_concat_different_fields() {
        // Batches whose field columns differ cannot be concatenated.
        let batch1 = new_batch(&[1], &[1], &[OpType::Put], &[1]);
        let fields = vec![
            batch1.fields()[0].clone(),
            BatchColumn {
                column_id: 2,
                data: Arc::new(UInt64Vector::from_slice([2])),
            },
        ];
        // Batch 2 has more fields.
        let batch2 = batch1.clone().with_fields(fields).unwrap();
        let err = Batch::concat(vec![batch1.clone(), batch2]).unwrap_err();
        assert!(
            matches!(err, Error::InvalidBatch { .. }),
            "unexpected err: {err}"
        );

        // Batch 2 has a different field (same count, different column id).
        let fields = vec![BatchColumn {
            column_id: 2,
            data: Arc::new(UInt64Vector::from_slice([2])),
        }];
        let batch2 = batch1.clone().with_fields(fields).unwrap();
        let err = Batch::concat(vec![batch1, batch2]).unwrap_err();
        assert!(
            matches!(err, Error::InvalidBatch { .. }),
            "unexpected err: {err}"
        );
    }
1436
    #[test]
    fn test_filter_deleted_empty() {
        // Filtering deletions on an empty batch keeps it empty.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch.filter_deleted().unwrap();
        assert!(batch.is_empty());
    }
1443
    #[test]
    fn test_filter_deleted() {
        // Rows with `OpType::Delete` are removed.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Delete, OpType::Put, OpType::Delete, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch.filter_deleted().unwrap();
        let expect = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
        assert_eq!(expect, batch);

        // A batch without deletions is unchanged.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let expect = batch.clone();
        batch.filter_deleted().unwrap();
        assert_eq!(expect, batch);
    }
1466
    #[test]
    fn test_filter_by_sequence() {
        // LtEq keeps rows with sequence <= max.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[1, 2, 3],
            &[11, 12, 13],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23],
        );
        assert_eq!(expect, batch);

        // Filters to empty.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );

        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // A `None` filter keeps all rows.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let expect = batch.clone();
        batch.filter_by_sequence(None).unwrap();
        assert_eq!(expect, batch);

        // Filter an empty batch.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch
            .filter_by_sequence(Some(SequenceRange::LtEq { max: 10 }))
            .unwrap();
        assert!(batch.is_empty());

        // Filter an empty batch with None.
        let mut batch = new_batch(&[], &[], &[], &[]);
        batch.filter_by_sequence(None).unwrap();
        assert!(batch.is_empty());

        // Test Gt variant - exclusive lower bound
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 12 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // Test Gt variant with no matches
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::Gt { min: 20 }))
            .unwrap();
        assert!(batch.is_empty());

        // Test GtLtEq variant - exclusive lower bound, inclusive upper bound
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 12, max: 14 }))
            .unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // Test GtLtEq variant with mixed operations
        let mut batch = new_batch(
            &[1, 2, 3, 4, 5],
            &[11, 12, 13, 14, 15],
            &[
                OpType::Put,
                OpType::Delete,
                OpType::Put,
                OpType::Delete,
                OpType::Put,
            ],
            &[21, 22, 23, 24, 25],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 11, max: 13 }))
            .unwrap();
        let expect = new_batch(
            &[2, 3],
            &[12, 13],
            &[OpType::Delete, OpType::Put],
            &[22, 23],
        );
        assert_eq!(expect, batch);

        // Test GtLtEq variant with no matches
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        batch
            .filter_by_sequence(Some(SequenceRange::GtLtEq { min: 20, max: 25 }))
            .unwrap();
        assert!(batch.is_empty());
    }
1603
    #[test]
    fn test_merge_last_non_null_no_dup() {
        // Without duplicate timestamps there is nothing to merge.
        let mut batch = new_batch_with_u64_fields(
            &[1, 2],
            &[2, 1],
            &[OpType::Put, OpType::Put],
            &[(1, &[Some(10), None]), (2, &[Some(100), Some(200)])],
        );
        let expect = batch.clone();
        batch.merge_last_non_null().unwrap();
        assert_eq!(expect, batch);
    }
1616
    #[test]
    fn test_merge_last_non_null_fill_null_fields() {
        // Rows are already sorted by timestamp asc and sequence desc.
        let mut batch = new_batch_with_u64_fields(
            &[1, 1, 1],
            &[3, 2, 1],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[
                (1, &[None, Some(10), Some(11)]),
                (2, &[Some(100), Some(200), Some(300)]),
            ],
        );
        batch.merge_last_non_null().unwrap();

        // Field 1 is filled from the first older row (seq=2). Field 2 keeps the base value.
        // Filled fields must not be overwritten by even older duplicates.
        let expect = new_batch_with_u64_fields(
            &[1],
            &[3],
            &[OpType::Put],
            &[(1, &[Some(10)]), (2, &[Some(100)])],
        );
        assert_eq!(expect, batch);
    }
1641
    #[test]
    fn test_merge_last_non_null_stop_at_delete_row() {
        // A delete row in older duplicates should stop filling to avoid resurrecting values before
        // deletion.
        let mut batch = new_batch_with_u64_fields(
            &[1, 1, 1],
            &[3, 2, 1],
            &[OpType::Put, OpType::Delete, OpType::Put],
            &[
                (1, &[None, Some(10), Some(11)]),
                (2, &[Some(100), Some(200), Some(300)]),
            ],
        );
        batch.merge_last_non_null().unwrap();

        // Field 1 stays null because the delete row blocks older values.
        let expect = new_batch_with_u64_fields(
            &[1],
            &[3],
            &[OpType::Put],
            &[(1, &[None]), (2, &[Some(100)])],
        );
        assert_eq!(expect, batch);
    }
1665
    #[test]
    fn test_merge_last_non_null_base_delete_no_merge() {
        // The newest row of the duplicate group is a delete.
        let mut batch = new_batch_with_u64_fields(
            &[1, 1],
            &[3, 2],
            &[OpType::Delete, OpType::Put],
            &[(1, &[None, Some(10)]), (2, &[None, Some(200)])],
        );
        batch.merge_last_non_null().unwrap();

        // Base row is delete, keep it as is and don't merge fields from older rows.
        let expect =
            new_batch_with_u64_fields(&[1], &[3], &[OpType::Delete], &[(1, &[None]), (2, &[None])]);
        assert_eq!(expect, batch);
    }
1681
    #[test]
    fn test_merge_last_non_null_multiple_timestamp_groups() {
        // Each timestamp group is merged independently: the newest row of a
        // group survives with its null fields filled from older duplicates.
        let mut batch = new_batch_with_u64_fields(
            &[1, 1, 2, 3, 3],
            &[5, 4, 3, 2, 1],
            &[
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
                OpType::Put,
            ],
            &[
                (1, &[None, Some(10), Some(20), None, Some(30)]),
                (2, &[Some(100), Some(110), Some(120), None, Some(130)]),
            ],
        );
        batch.merge_last_non_null().unwrap();

        let expect = new_batch_with_u64_fields(
            &[1, 2, 3],
            &[5, 3, 2],
            &[OpType::Put, OpType::Put, OpType::Put],
            &[
                (1, &[Some(10), Some(20), Some(30)]),
                (2, &[Some(100), Some(120), Some(130)]),
            ],
        );
        assert_eq!(expect, batch);
    }
1712
    #[test]
    fn test_merge_last_non_null_no_fields() {
        // Without field columns, merging only dedups rows by timestamp,
        // keeping the row with the highest sequence.
        let mut batch = new_batch_without_fields(
            &[1, 1, 2],
            &[3, 2, 1],
            &[OpType::Put, OpType::Put, OpType::Put],
        );
        batch.merge_last_non_null().unwrap();

        let expect = new_batch_without_fields(&[1, 2], &[3, 1], &[OpType::Put, OpType::Put]);
        assert_eq!(expect, batch);
    }
1725
    #[test]
    fn test_filter() {
        // `filter` applies a row-wise boolean predicate to the batch.
        // Filters put only.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
        batch.filter(&predicate).unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // Filters deletion.
        let mut batch = new_batch(
            &[1, 2, 3, 4],
            &[11, 12, 13, 14],
            &[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
            &[21, 22, 23, 24],
        );
        let predicate = BooleanVector::from_vec(vec![false, false, true, true]);
        batch.filter(&predicate).unwrap();
        let expect = new_batch(&[3, 4], &[13, 14], &[OpType::Put, OpType::Put], &[23, 24]);
        assert_eq!(expect, batch);

        // Filters to empty.
        let predicate = BooleanVector::from_vec(vec![false, false]);
        batch.filter(&predicate).unwrap();
        assert!(batch.is_empty());
    }
1757
1758    #[test]
1759    fn test_sort_and_dedup() {
1760        let original = new_batch(
1761            &[2, 3, 1, 4, 5, 2],
1762            &[1, 2, 3, 4, 5, 6],
1763            &[
1764                OpType::Put,
1765                OpType::Put,
1766                OpType::Put,
1767                OpType::Put,
1768                OpType::Put,
1769                OpType::Put,
1770            ],
1771            &[21, 22, 23, 24, 25, 26],
1772        );
1773
1774        let mut batch = original.clone();
1775        batch.sort(true).unwrap();
1776        // It should only keep one timestamp 2.
1777        assert_eq!(
1778            new_batch(
1779                &[1, 2, 3, 4, 5],
1780                &[3, 6, 2, 4, 5],
1781                &[
1782                    OpType::Put,
1783                    OpType::Put,
1784                    OpType::Put,
1785                    OpType::Put,
1786                    OpType::Put,
1787                ],
1788                &[23, 26, 22, 24, 25],
1789            ),
1790            batch
1791        );
1792
1793        let mut batch = original.clone();
1794        batch.sort(false).unwrap();
1795
1796        // It should only keep one timestamp 2.
1797        assert_eq!(
1798            new_batch(
1799                &[1, 2, 2, 3, 4, 5],
1800                &[3, 6, 1, 2, 4, 5],
1801                &[
1802                    OpType::Put,
1803                    OpType::Put,
1804                    OpType::Put,
1805                    OpType::Put,
1806                    OpType::Put,
1807                    OpType::Put,
1808                ],
1809                &[23, 26, 21, 22, 24, 25],
1810            ),
1811            batch
1812        );
1813
1814        let original = new_batch(
1815            &[2, 2, 1],
1816            &[1, 6, 1],
1817            &[OpType::Delete, OpType::Put, OpType::Put],
1818            &[21, 22, 23],
1819        );
1820
1821        let mut batch = original.clone();
1822        batch.sort(true).unwrap();
1823        let expect = new_batch(&[1, 2], &[1, 6], &[OpType::Put, OpType::Put], &[23, 22]);
1824        assert_eq!(expect, batch);
1825
1826        let mut batch = original.clone();
1827        batch.sort(false).unwrap();
1828        let expect = new_batch(
1829            &[1, 2, 2],
1830            &[1, 6, 1],
1831            &[OpType::Put, OpType::Put, OpType::Delete],
1832            &[23, 22, 21],
1833        );
1834        assert_eq!(expect, batch);
1835    }
1836
1837    #[test]
1838    fn test_get_value() {
1839        let encodings = [PrimaryKeyEncoding::Dense, PrimaryKeyEncoding::Sparse];
1840
1841        for encoding in encodings {
1842            let codec = build_primary_key_codec_with_fields(
1843                encoding,
1844                [
1845                    (
1846                        ReservedColumnId::table_id(),
1847                        row_converter::SortField::new(ConcreteDataType::uint32_datatype()),
1848                    ),
1849                    (
1850                        ReservedColumnId::tsid(),
1851                        row_converter::SortField::new(ConcreteDataType::uint64_datatype()),
1852                    ),
1853                    (
1854                        100,
1855                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1856                    ),
1857                    (
1858                        200,
1859                        row_converter::SortField::new(ConcreteDataType::string_datatype()),
1860                    ),
1861                ]
1862                .into_iter(),
1863            );
1864
1865            let values = [
1866                Value::UInt32(1000),
1867                Value::UInt64(2000),
1868                Value::String("abcdefgh".into()),
1869                Value::String("zyxwvu".into()),
1870            ];
1871            let mut buf = vec![];
1872            codec
1873                .encode_values(
1874                    &[
1875                        (ReservedColumnId::table_id(), values[0].clone()),
1876                        (ReservedColumnId::tsid(), values[1].clone()),
1877                        (100, values[2].clone()),
1878                        (200, values[3].clone()),
1879                    ],
1880                    &mut buf,
1881                )
1882                .unwrap();
1883
1884            let field_col_id = 2;
1885            let mut batch = new_batch_builder(
1886                &buf,
1887                &[1, 2, 3],
1888                &[1, 1, 1],
1889                &[OpType::Put, OpType::Put, OpType::Put],
1890                field_col_id,
1891                &[42, 43, 44],
1892            )
1893            .build()
1894            .unwrap();
1895
1896            let v = batch
1897                .pk_col_value(&*codec, 0, ReservedColumnId::table_id())
1898                .unwrap()
1899                .unwrap();
1900            assert_eq!(values[0], *v);
1901
1902            let v = batch
1903                .pk_col_value(&*codec, 1, ReservedColumnId::tsid())
1904                .unwrap()
1905                .unwrap();
1906            assert_eq!(values[1], *v);
1907
1908            let v = batch.pk_col_value(&*codec, 2, 100).unwrap().unwrap();
1909            assert_eq!(values[2], *v);
1910
1911            let v = batch.pk_col_value(&*codec, 3, 200).unwrap().unwrap();
1912            assert_eq!(values[3], *v);
1913
1914            let v = batch.field_col_value(field_col_id).unwrap();
1915            assert_eq!(v.data.get(0), Value::UInt64(42));
1916            assert_eq!(v.data.get(1), Value::UInt64(43));
1917            assert_eq!(v.data.get(2), Value::UInt64(44));
1918        }
1919    }
1920}