Skip to main content

mito2/read/
range_cache.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Utilities for the partition range scan result cache.
16
17use std::mem;
18use std::sync::Arc;
19
20use async_stream::try_stream;
21use common_telemetry::warn;
22use common_time::Timestamp;
23use common_time::range::TimestampRange;
24use common_time::timestamp::TimeUnit;
25use datafusion_expr::expr::Expr;
26use datafusion_expr::{Between, BinaryExpr, Operator};
27use datatypes::arrow::compute::concat_batches;
28use datatypes::arrow::record_batch::RecordBatch;
29use datatypes::prelude::ConcreteDataType;
30use datatypes::value::scalar_value_to_timestamp;
31use futures::TryStreamExt;
32use snafu::ResultExt;
33use store_api::region_engine::PartitionRange;
34use store_api::storage::{FileId, RegionId, TimeSeriesRowSelector};
35use table::predicate::is_string_timestamp_literal;
36use tokio::sync::{mpsc, oneshot};
37
38use crate::cache::CacheStrategy;
39use crate::error::{ComputeArrowSnafu, Result};
40use crate::read::BoxedRecordBatchStream;
41use crate::read::read_columns::ReadColumns;
42use crate::read::scan_region::StreamContext;
43use crate::read::scan_util::PartitionMetrics;
44use crate::region::options::MergeMode;
45use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
46
/// Buffered-bytes threshold (8 MiB) used by `CacheBatchBuffer::push`: once more
/// than one batch is buffered and their total array memory exceeds this value,
/// the buffered batches are handed to the compaction task.
const RANGE_CACHE_COMPACT_THRESHOLD_BYTES: usize = 8 * 1024 * 1024;
48
/// Fingerprint of the scan request fields that affect partition range cache reuse.
///
/// It records a normalized view of the projected columns and filters, plus
/// scan options that can change the returned rows. Schema-dependent metadata
/// and the partition expression version are included so cached results are not
/// reused across incompatible schema or partitioning changes.
///
/// `inner` and `time_filters` are reference-counted, so cloning a fingerprint
/// does not copy the projection or the filter strings.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct ScanRequestFingerprint {
    /// Projection and filters without the time index and partition exprs.
    inner: Arc<SharedScanRequestFingerprint>,
    /// Filters with the time index column.
    ///
    /// `None` when the builder received no such filters (an empty list is
    /// never stored) or after `without_time_filters` removed them.
    time_filters: Option<Arc<Vec<String>>>,
    /// Row selector recorded for the scan, if any.
    series_row_selector: Option<TimeSeriesRowSelector>,
    /// Append-mode flag recorded for the scan.
    append_mode: bool,
    /// Deleted-row filtering flag recorded for the scan.
    filter_deleted: bool,
    /// Merge mode recorded for the scan.
    merge_mode: MergeMode,
    /// We keep the partition expr version to ensure we won't reuse the fingerprint after we change the partition expr.
    /// We store the version instead of the whole partition expr or partition expr filters.
    partition_expr_version: u64,
}
69
/// Plain input fields used to assemble a [`ScanRequestFingerprint`].
///
/// `build` moves `read_columns`, `read_column_types` and `filters` into the
/// `Arc`-shared part of the fingerprint and maps an empty `time_filters` list
/// to `None`; the remaining fields are carried over unchanged.
#[derive(Debug)]
pub(crate) struct ScanRequestFingerprintBuilder {
    /// Logical columns of the projection.
    pub(crate) read_columns: ReadColumns,
    /// Column types of the projection.
    pub(crate) read_column_types: Vec<Option<ConcreteDataType>>,
    /// Filters without the time index column and region partition exprs.
    pub(crate) filters: Vec<String>,
    /// Filters with the time index column; may be empty.
    pub(crate) time_filters: Vec<String>,
    pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
    pub(crate) append_mode: bool,
    pub(crate) filter_deleted: bool,
    pub(crate) merge_mode: MergeMode,
    pub(crate) partition_expr_version: u64,
}
82
83impl ScanRequestFingerprintBuilder {
84    pub(crate) fn build(self) -> ScanRequestFingerprint {
85        let Self {
86            read_columns,
87            read_column_types,
88            filters,
89            time_filters,
90            series_row_selector,
91            append_mode,
92            filter_deleted,
93            merge_mode,
94            partition_expr_version,
95        } = self;
96
97        ScanRequestFingerprint {
98            inner: Arc::new(SharedScanRequestFingerprint {
99                read_columns,
100                read_column_types,
101                filters,
102            }),
103            time_filters: (!time_filters.is_empty()).then(|| Arc::new(time_filters)),
104            series_row_selector,
105            append_mode,
106            filter_deleted,
107            merge_mode,
108            partition_expr_version,
109        }
110    }
111}
112
/// Part of the fingerprint that is shared behind an `Arc`.
///
/// It does not implement `Clone`; clones of [`ScanRequestFingerprint`] share a
/// single instance through the reference count instead.
#[derive(Debug, PartialEq, Eq, Hash)]
struct SharedScanRequestFingerprint {
    /// Logical columns of the projection.
    read_columns: ReadColumns,
    /// Column types of the projection.
    /// We keep this to ensure we won't reuse the fingerprint after a schema change.
    read_column_types: Vec<Option<ConcreteDataType>>,
    /// Filters without the time index column and region partition exprs.
    filters: Vec<String>,
}
124
125impl ScanRequestFingerprint {
126    #[cfg(test)]
127    pub(crate) fn read_columns(&self) -> &ReadColumns {
128        &self.inner.read_columns
129    }
130
131    #[cfg(test)]
132    pub(crate) fn read_column_types(&self) -> &[Option<ConcreteDataType>] {
133        &self.inner.read_column_types
134    }
135
136    #[cfg(test)]
137    pub(crate) fn filters(&self) -> &[String] {
138        &self.inner.filters
139    }
140
141    #[cfg(test)]
142    pub(crate) fn time_filters(&self) -> &[String] {
143        self.time_filters
144            .as_deref()
145            .map(Vec::as_slice)
146            .unwrap_or(&[])
147    }
148
149    pub(crate) fn without_time_filters(&self) -> Self {
150        Self {
151            inner: Arc::clone(&self.inner),
152            time_filters: None,
153            series_row_selector: self.series_row_selector,
154            append_mode: self.append_mode,
155            filter_deleted: self.filter_deleted,
156            merge_mode: self.merge_mode,
157            partition_expr_version: self.partition_expr_version,
158        }
159    }
160
161    pub(crate) fn estimated_size(&self) -> usize {
162        mem::size_of::<SharedScanRequestFingerprint>()
163            + self.inner.read_columns.estimated_size()
164            + self.inner.read_column_types.capacity() * mem::size_of::<Option<ConcreteDataType>>()
165            + self.inner.filters.capacity() * mem::size_of::<String>()
166            + self
167                .inner
168                .filters
169                .iter()
170                .map(|filter| filter.capacity())
171                .sum::<usize>()
172            + self.time_filters.as_ref().map_or(0, |filters| {
173                mem::size_of::<Vec<String>>()
174                    + filters.capacity() * mem::size_of::<String>()
175                    + filters
176                        .iter()
177                        .map(|filter| filter.capacity())
178                        .sum::<usize>()
179            })
180    }
181}
182
/// Cache key for range scan outputs.
///
/// Two scans share a cache entry only when the region, the exact set of row
/// groups and the scan fingerprint are all equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct RangeScanCacheKey {
    /// Region the cached range belongs to.
    pub(crate) region_id: RegionId,
    /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers.
    pub(crate) row_groups: Vec<(FileId, i64)>,
    /// Fingerprint of the scan request that produced the cached output.
    pub(crate) scan: ScanRequestFingerprint,
}
191
192impl RangeScanCacheKey {
193    pub(crate) fn estimated_size(&self) -> usize {
194        mem::size_of::<Self>()
195            + self.row_groups.capacity() * mem::size_of::<(FileId, i64)>()
196            + self.scan.estimated_size()
197    }
198}
199
/// Cached result for one range scan.
///
/// One or more record batches are stored as a single `batch`; `slice_lengths`
/// keeps the row count of each original batch so that replay can slice them
/// back out in order.
#[derive(Debug)]
pub(crate) struct CachedBatchSlice {
    /// The single batch, or the concatenation of all batches.
    batch: RecordBatch,
    /// Row counts of the original batches, in their original order.
    slice_lengths: Vec<usize>,
}
206
207impl CachedBatchSlice {
208    fn metadata_size(&self) -> usize {
209        self.slice_lengths.capacity() * mem::size_of::<usize>()
210    }
211}
212
/// Cached output of one range scan: the compacted batches together with their
/// precomputed memory size.
pub(crate) struct RangeScanCacheValue {
    /// Compacted batches in the order they were produced by the scan.
    cached_batches: Vec<CachedBatchSlice>,
    /// Precomputed size of all compacted batches.
    estimated_batches_size: usize,
}
218
219impl RangeScanCacheValue {
220    pub(crate) fn new(
221        cached_batches: Vec<CachedBatchSlice>,
222        estimated_batches_size: usize,
223    ) -> Self {
224        Self {
225            cached_batches,
226            estimated_batches_size,
227        }
228    }
229
230    pub(crate) fn estimated_size(&self) -> usize {
231        mem::size_of::<Self>()
232            + self.cached_batches.capacity() * mem::size_of::<CachedBatchSlice>()
233            + self
234                .cached_batches
235                .iter()
236                .map(CachedBatchSlice::metadata_size)
237                .sum::<usize>()
238            + self.estimated_batches_size
239    }
240}
241
/// Row groups and whether all sources are file-only for a partition range.
pub(crate) struct PartitionRangeRowGroups {
    /// Sorted (file_id, row_group_index) pairs.
    pub(crate) row_groups: Vec<(FileId, i64)>,
    /// `false` if at least one row group index of the range is not a file
    /// range index; such indices are not listed in `row_groups`.
    pub(crate) only_file_sources: bool,
}
248
249/// Collects (file_id, row_group_index) pairs from a partition range's row group indices.
250pub(crate) fn collect_partition_range_row_groups(
251    stream_ctx: &StreamContext,
252    part_range: &PartitionRange,
253) -> PartitionRangeRowGroups {
254    let range_meta = &stream_ctx.ranges[part_range.identifier];
255    let mut row_groups = Vec::new();
256    let mut only_file_sources = true;
257
258    for index in &range_meta.row_group_indices {
259        if stream_ctx.is_file_range_index(*index) {
260            let file_id = stream_ctx.input.file_from_index(*index).file_id().file_id();
261            row_groups.push((file_id, index.row_group_index));
262        } else {
263            only_file_sources = false;
264        }
265    }
266
267    row_groups.sort_unstable_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()).then(a.1.cmp(&b.1)));
268
269    PartitionRangeRowGroups {
270        row_groups,
271        only_file_sources,
272    }
273}
274
275/// Returns the timestamp range where all time-only predicates are guaranteed true.
276///
277/// Returns `Some(min_to_max)` for empty input (vacuously true everywhere).
278/// Returns `None` if any expression contains an unsupported shape: `OR`, `NOT`,
279/// `IN`, non-literal RHS, unsupported operator, column-name mismatch, an `=`
280/// literal that cannot be represented exactly in the column unit, or overflow
281/// during bound adjustment.
282///
283/// This is intentionally stricter than `extract_time_range_from_expr` in
284/// `table::predicate`: lower bounds round up and upper bounds round down. If a
285/// partition's file-time range is contained by the returned range, every row in
286/// that partition satisfies the original time predicates.
287///
288/// `IsNull`/`IsNotNull` on the time index are not routed into `time_filters`
289/// today. If that changes, handle them here before stripping time filters from
290/// the cache key.
291pub(crate) fn implied_time_range_from_exprs(
292    ts_col_name: &str,
293    ts_col_unit: TimeUnit,
294    exprs: &[&Expr],
295) -> Option<TimestampRange> {
296    let mut acc = TimestampRange::min_to_max();
297    for expr in exprs {
298        let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, expr)?;
299        acc = acc.and(&r);
300    }
301    Some(acc)
302}
303
304fn implied_time_range_from_expr(
305    ts_col_name: &str,
306    ts_col_unit: TimeUnit,
307    expr: &Expr,
308) -> Option<TimestampRange> {
309    match expr {
310        Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op {
311            Operator::And => {
312                let l = implied_time_range_from_expr(ts_col_name, ts_col_unit, left)?;
313                let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, right)?;
314                Some(l.and(&r))
315            }
316            Operator::Eq | Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => {
317                implied_from_cmp(ts_col_name, ts_col_unit, left, *op, right)
318            }
319            // `OR` would require a strict intersection over a union of half-planes
320            // (not the loose-span union provided by `TimestampRange::or`), so we
321            // refuse it. Any other operator is unsupported.
322            _ => None,
323        },
324        Expr::Between(Between {
325            expr,
326            negated,
327            low,
328            high,
329        }) => {
330            if *negated {
331                return None;
332            }
333            implied_from_between(ts_col_name, ts_col_unit, expr, low, high)
334        }
335        // Includes `IsNull`, `IsNotNull`, `Not`, `InList`, function calls, etc.
336        _ => None,
337    }
338}
339
340fn match_ts_column_literal<'a>(
341    ts_col_name: &str,
342    left: &'a Expr,
343    right: &'a Expr,
344) -> Option<(Timestamp, bool)> {
345    let (col, scalar, reverse) = match (left, right) {
346        (Expr::Column(c), Expr::Literal(s, _)) => (c, s, false),
347        (Expr::Literal(s, _), Expr::Column(c)) => (c, s, true),
348        _ => return None,
349    };
350    if col.name != ts_col_name {
351        return None;
352    }
353    // Reject string literals: their conversion needs a timezone we do not have,
354    // and the existing extractor in `table::predicate` rejects them too.
355    if is_string_timestamp_literal(scalar) {
356        return None;
357    }
358    scalar_value_to_timestamp(scalar, None).map(|t| (t, reverse))
359}
360
361fn implied_from_cmp(
362    ts_col_name: &str,
363    ts_col_unit: TimeUnit,
364    left: &Expr,
365    op: Operator,
366    right: &Expr,
367) -> Option<TimestampRange> {
368    let (ts, reverse) = match_ts_column_literal(ts_col_name, left, right)?;
369    // Normalize to "column OP literal".
370    let op = if reverse {
371        match op {
372            Operator::Lt => Operator::Gt,
373            Operator::LtEq => Operator::GtEq,
374            Operator::Gt => Operator::Lt,
375            Operator::GtEq => Operator::LtEq,
376            Operator::Eq => Operator::Eq,
377            _ => return None,
378        }
379    } else {
380        op
381    };
382
383    match op {
384        Operator::GtEq => {
385            // ts >= L. Round the lower bound up in the column unit.
386            let b = ts.convert_to_ceil(ts_col_unit)?;
387            Some(TimestampRange::from_start(b))
388        }
389        Operator::Gt => {
390            // ts > L. floor(L) + 1 is the tight lower bound in the column unit.
391            let v = ts.convert_to(ts_col_unit)?.value().checked_add(1)?;
392            Some(TimestampRange::from_start(Timestamp::new(v, ts_col_unit)))
393        }
394        Operator::LtEq => {
395            // ts <= L. Round the upper bound down in the column unit.
396            let b = ts.convert_to(ts_col_unit)?;
397            Some(TimestampRange::until_end(b, true))
398        }
399        Operator::Lt => {
400            // ts < L. `ts < ceil(L)` is the tight bound: equal to `ts < L` when
401            // L is exactly representable, and `ts <= floor(L)` otherwise.
402            let b = ts.convert_to_ceil(ts_col_unit)?;
403            Some(TimestampRange::until_end(b, false))
404        }
405        Operator::Eq => {
406            // ts = L. Only provable when L is exactly representable.
407            let f = ts.convert_to(ts_col_unit)?;
408            let c = ts.convert_to_ceil(ts_col_unit)?;
409            if f.value() != c.value() {
410                return None;
411            }
412            Some(TimestampRange::single(f))
413        }
414        _ => None,
415    }
416}
417
418fn implied_from_between(
419    ts_col_name: &str,
420    ts_col_unit: TimeUnit,
421    expr: &Expr,
422    low: &Expr,
423    high: &Expr,
424) -> Option<TimestampRange> {
425    let Expr::Column(c) = expr else {
426        return None;
427    };
428    if c.name != ts_col_name {
429        return None;
430    }
431    let (low_s, high_s) = match (low, high) {
432        (Expr::Literal(l, _), Expr::Literal(h, _)) => (l, h),
433        _ => return None,
434    };
435    if is_string_timestamp_literal(low_s) || is_string_timestamp_literal(high_s) {
436        return None;
437    }
438    let low_ts = scalar_value_to_timestamp(low_s, None)?;
439    let high_ts = scalar_value_to_timestamp(high_s, None)?;
440    // BETWEEN low AND high is equivalent to ts >= low AND ts <= high.
441    let lo = low_ts.convert_to_ceil(ts_col_unit)?;
442    let hi = high_ts.convert_to(ts_col_unit)?;
443    Some(TimestampRange::new_inclusive(Some(lo), Some(hi)))
444}
445
446/// Builds a cache key for the given partition range if it is eligible for caching.
447pub(crate) fn build_range_cache_key(
448    stream_ctx: &StreamContext,
449    part_range: &PartitionRange,
450) -> Option<RangeScanCacheKey> {
451    if !stream_ctx.input.cache_strategy.has_range_result_cache() {
452        return None;
453    }
454
455    let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
456
457    // Dyn filters can change at runtime, so we can't cache when they're present.
458    let has_dyn_filters = stream_ctx
459        .input
460        .predicate_group()
461        .predicate_without_region()
462        .is_some_and(|p| !p.dyn_filters().is_empty());
463    if has_dyn_filters {
464        return None;
465    }
466
467    let rg = collect_partition_range_row_groups(stream_ctx, part_range);
468    if !rg.only_file_sources || rg.row_groups.is_empty() {
469        return None;
470    }
471
472    // If the implied range covers this partition's `FileTimeRange`, drop
473    // time-only predicates from the cache key so that queries with different
474    // but equally-covering time bounds share an entry. `None` means some
475    // time-only predicate had an unsupported shape (e.g. `OR`), so we keep
476    // them in the key.
477    let range_meta = &stream_ctx.ranges[part_range.identifier];
478    let (file_min, file_max) = range_meta.time_range;
479    let covers = match &stream_ctx.scan_implied_time_range {
480        // An empty implied range can never cover a non-empty file range, so
481        // short-circuit. We also skip the unit asserts because
482        // `TimestampRange::empty()` uses `Timestamp::default()` (millisecond),
483        // which would falsely trip the asserts for non-ms time index columns.
484        Some(implied) if !implied.is_empty() => {
485            // The `contains` check is sound only when `file_min`/`file_max`
486            // share the implied range's unit (the time index column's unit).
487            // Mito stores time index values in that unit; assert to catch any
488            // future drift.
489            if let Some(ts) = implied.start().as_ref().or(implied.end().as_ref()) {
490                assert_eq!(file_min.unit(), ts.unit());
491                assert_eq!(file_max.unit(), ts.unit());
492            }
493            implied.contains(&file_min) && implied.contains(&file_max)
494        }
495        _ => false,
496    };
497    let scan = if covers {
498        fingerprint.without_time_filters()
499    } else {
500        fingerprint.clone()
501    };
502
503    Some(RangeScanCacheKey {
504        region_id: stream_ctx.input.region_metadata().region_id,
505        row_groups: rg.row_groups,
506        scan,
507    })
508}
509
510/// Returns a stream that replays cached record batches.
511pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
512    Box::pin(try_stream! {
513        for cached_batch in &value.cached_batches {
514            let mut offset = 0;
515            for &len in &cached_batch.slice_lengths {
516                yield cached_batch.batch.slice(offset, len);
517                offset += len;
518            }
519        }
520    })
521}
522
/// Commands sent from `CacheBatchBuffer` to the background concat task.
enum CacheConcatCommand {
    /// Compact these batches and append the result to the task state.
    Compact(Vec<RecordBatch>),
    /// Compact the remaining `pending` batches, store the final value under
    /// `key` through `cache_strategy`, and report the outcome through
    /// `result_tx` when it is set. The task stops after handling this command.
    Finish {
        pending: Vec<RecordBatch>,
        key: RangeScanCacheKey,
        cache_strategy: CacheStrategy,
        part_metrics: PartitionMetrics,
        result_tx: Option<oneshot::Sender<Result<Arc<RangeScanCacheValue>>>>,
    },
}
533
/// Output accumulated by the concat task while a range is being scanned.
#[derive(Default)]
struct CacheConcatState {
    /// Compacted batches produced so far, in arrival order.
    cached_batches: Vec<CachedBatchSlice>,
    /// Sum of `get_array_memory_size()` over the compacted batches.
    estimated_size: usize,
}
539
540impl CacheConcatState {
541    async fn compact(
542        &mut self,
543        batches: Vec<RecordBatch>,
544        limiter: &crate::cache::RangeResultMemoryLimiter,
545    ) -> Result<()> {
546        if batches.is_empty() {
547            return Ok(());
548        }
549
550        let input_size = batches
551            .iter()
552            .map(RecordBatch::get_array_memory_size)
553            .sum::<usize>();
554        let _permit = limiter.acquire(input_size).await?;
555
556        let compacted = compact_record_batches(batches)?;
557        self.estimated_size += compacted.batch.get_array_memory_size();
558        self.cached_batches.push(compacted);
559        Ok(())
560    }
561
562    fn finish(self) -> RangeScanCacheValue {
563        RangeScanCacheValue::new(self.cached_batches, self.estimated_size)
564    }
565}
566
567fn compact_record_batches(batches: Vec<RecordBatch>) -> Result<CachedBatchSlice> {
568    debug_assert!(!batches.is_empty());
569
570    let slice_lengths = batches.iter().map(RecordBatch::num_rows).collect();
571    build_cached_batch_slice(batches, slice_lengths)
572}
573
574fn build_cached_batch_slice(
575    batches: Vec<RecordBatch>,
576    slice_lengths: Vec<usize>,
577) -> Result<CachedBatchSlice> {
578    let batch = if batches.len() == 1 {
579        batches.into_iter().next().unwrap()
580    } else {
581        let schema = batches[0].schema();
582        concat_batches(&schema, &batches).context(ComputeArrowSnafu)?
583    };
584
585    Ok(CachedBatchSlice {
586        batch,
587        slice_lengths,
588    })
589}
590
591async fn run_cache_concat_task(
592    mut rx: mpsc::UnboundedReceiver<CacheConcatCommand>,
593    limiter: Arc<crate::cache::RangeResultMemoryLimiter>,
594    skip_threshold_bytes: usize,
595) {
596    let mut state = CacheConcatState::default();
597
598    while let Some(cmd) = rx.recv().await {
599        match cmd {
600            CacheConcatCommand::Compact(batches) => {
601                if let Err(err) = state.compact(batches, &limiter).await {
602                    warn!(err; "Failed to compact range cache batches");
603                    return;
604                }
605                // Close the channel to stop further work as soon as the cached
606                // size exceeds the configured cache budget.
607                if state.estimated_size > skip_threshold_bytes {
608                    return;
609                }
610            }
611            CacheConcatCommand::Finish {
612                pending,
613                key,
614                cache_strategy,
615                part_metrics,
616                result_tx,
617            } => {
618                let compact_result = state
619                    .compact(pending, &limiter)
620                    .await
621                    .map(|()| state.finish());
622                let result = match compact_result {
623                    Ok(v) => {
624                        let value = Arc::new(v);
625                        part_metrics
626                            .inc_range_cache_size(key.estimated_size() + value.estimated_size());
627                        cache_strategy.put_range_result(key, value.clone());
628
629                        Ok(value)
630                    }
631                    Err(e) => {
632                        warn!(e; "Failed to finalize range cache batches");
633
634                        Err(e)
635                    }
636                };
637
638                if let Some(tx) = result_tx {
639                    let _ = tx.send(result);
640                }
641
642                break;
643            }
644        }
645    }
646}
647
/// Producer-side buffer that groups scanned batches and forwards them to the
/// background concat task.
struct CacheBatchBuffer {
    /// Batches buffered since the last compaction request.
    buffered_batches: Vec<RecordBatch>,
    /// Total row count of `buffered_batches`.
    buffered_rows: usize,
    /// Total `get_array_memory_size()` of `buffered_batches`.
    buffered_size: usize,
    /// Channel to the concat task. `None` when the cache strategy provides no
    /// range result memory limiter, or after a send to the task failed.
    sender: Option<mpsc::UnboundedSender<CacheConcatCommand>>,
}
654
655impl CacheBatchBuffer {
656    fn new(cache_strategy: &CacheStrategy) -> Self {
657        let sender = cache_strategy.range_result_memory_limiter().map(|limiter| {
658            let skip_threshold_bytes = cache_strategy.range_result_cache_size().unwrap_or(0);
659            let (tx, rx) = mpsc::unbounded_channel();
660            common_runtime::spawn_global(run_cache_concat_task(
661                rx,
662                limiter.clone(),
663                skip_threshold_bytes,
664            ));
665            tx
666        });
667
668        Self {
669            buffered_batches: Vec::new(),
670            buffered_rows: 0,
671            buffered_size: 0,
672            sender,
673        }
674    }
675
676    fn push(&mut self, batch: RecordBatch) -> Result<()> {
677        if self.sender.is_none() {
678            return Ok(());
679        }
680
681        self.buffered_rows += batch.num_rows();
682        self.buffered_size += batch.get_array_memory_size();
683        self.buffered_batches.push(batch);
684
685        if self.buffered_batches.len() > 1
686            && (self.buffered_rows > DEFAULT_READ_BATCH_SIZE
687                || self.buffered_size > RANGE_CACHE_COMPACT_THRESHOLD_BYTES)
688        {
689            self.notify_compact();
690        }
691
692        Ok(())
693    }
694
695    fn notify_compact(&mut self) {
696        if self.buffered_batches.is_empty() || self.sender.is_none() {
697            return;
698        }
699
700        let batches = mem::take(&mut self.buffered_batches);
701        self.buffered_rows = 0;
702        self.buffered_size = 0;
703
704        let Some(sender) = &self.sender else {
705            return;
706        };
707        if sender.send(CacheConcatCommand::Compact(batches)).is_err() {
708            self.sender = None;
709        }
710    }
711
712    fn finish(
713        mut self,
714        key: RangeScanCacheKey,
715        cache_strategy: CacheStrategy,
716        part_metrics: PartitionMetrics,
717        result_tx: Option<oneshot::Sender<Result<Arc<RangeScanCacheValue>>>>,
718    ) {
719        let Some(sender) = self.sender.take() else {
720            return;
721        };
722
723        if sender
724            .send(CacheConcatCommand::Finish {
725                pending: mem::take(&mut self.buffered_batches),
726                key,
727                cache_strategy,
728                part_metrics,
729                result_tx,
730            })
731            .is_err()
732        {
733            self.sender = None;
734        }
735    }
736}
737
738/// Wraps a stream to cache its output for future range cache hits.
739pub(crate) fn cache_flat_range_stream(
740    mut stream: BoxedRecordBatchStream,
741    cache_strategy: CacheStrategy,
742    key: RangeScanCacheKey,
743    part_metrics: PartitionMetrics,
744) -> BoxedRecordBatchStream {
745    Box::pin(try_stream! {
746        let mut buffer = CacheBatchBuffer::new(&cache_strategy);
747        while let Some(batch) = stream.try_next().await? {
748            buffer.push(batch.clone())?;
749            yield batch;
750        }
751
752        buffer.finish(key, cache_strategy, part_metrics, None);
753    })
754}
755
/// Wraps `stream` with `cache_flat_range_stream`, using placeholder internals
/// for benchmarking.
///
/// A fresh cache manager with a range result cache of `cache_size_bytes` is
/// created for every call. This avoids exposing `RangeScanCacheKey`,
/// `ScanRequestFingerprint`, and `PartitionMetrics` publicly.
#[cfg(feature = "test")]
pub fn bench_cache_flat_range_stream(
    stream: BoxedRecordBatchStream,
    cache_size_bytes: u64,
    region_id: RegionId,
) -> BoxedRecordBatchStream {
    use std::time::Instant;

    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;

    use crate::region::options::MergeMode;

    // Empty key: no projected columns, no filters and no row groups.
    let key = RangeScanCacheKey {
        region_id,
        row_groups: Vec::new(),
        scan: ScanRequestFingerprintBuilder {
            read_columns: ReadColumns::from_deduped_column_ids(std::iter::empty()),
            read_column_types: Vec::new(),
            filters: Vec::new(),
            time_filters: Vec::new(),
            series_row_selector: None,
            append_mode: false,
            filter_deleted: false,
            merge_mode: MergeMode::LastRow,
            partition_expr_version: 0,
        }
        .build(),
    };

    let metrics_set = ExecutionPlanMetricsSet::new();
    let part_metrics =
        PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set);

    let cache_manager = crate::cache::CacheManager::builder()
        .range_result_cache_size(cache_size_bytes)
        .build();
    let cache_strategy = CacheStrategy::EnableAll(Arc::new(cache_manager));

    cache_flat_range_stream(stream, cache_strategy, key, part_metrics)
}
804
805#[cfg(test)]
806mod tests {
807    use std::sync::Arc;
808    use std::time::Instant;
809
810    use common_time::Timestamp;
811    use common_time::range::TimestampRange;
812    use common_time::timestamp::TimeUnit;
813    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
814    use datafusion_common::ScalarValue;
815    use datafusion_expr::{Expr, col, lit};
816    use smallvec::smallvec;
817    use store_api::storage::{FileId, RegionId};
818
819    use super::*;
820    use crate::cache::CacheManager;
821    use crate::read::flat_projection::FlatProjectionMapper;
822    use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex};
823    use crate::read::scan_region::{PredicateGroup, ScanInput};
824    use crate::sst::file::FileTimeRange;
825    use crate::test_util::memtable_util::metadata_with_primary_key;
826    use crate::test_util::scheduler_util::SchedulerEnv;
827    use crate::test_util::sst_util::sst_file_handle_with_file_id;
828
829    fn test_cache_strategy() -> CacheStrategy {
830        CacheStrategy::EnableAll(Arc::new(
831            CacheManager::builder()
832                .range_result_cache_size(1024 * 1024)
833                .build(),
834        ))
835    }
836
837    fn test_scan_fingerprint(
838        filters: Vec<String>,
839        time_filters: Vec<String>,
840        series_row_selector: Option<TimeSeriesRowSelector>,
841        filter_deleted: bool,
842        partition_expr_version: u64,
843    ) -> ScanRequestFingerprint {
844        let read_columns = ReadColumns::from_deduped_column_ids([1, 2]);
845        ScanRequestFingerprintBuilder {
846            read_columns,
847            read_column_types: vec![None, None],
848            filters,
849            time_filters,
850            series_row_selector,
851            append_mode: false,
852            filter_deleted,
853            merge_mode: MergeMode::LastRow,
854            partition_expr_version,
855        }
856        .build()
857    }
858
859    fn test_cache_context(strategy: &CacheStrategy) -> (RangeScanCacheKey, PartitionMetrics) {
860        let region_id = RegionId::new(1, 1);
861        let key = RangeScanCacheKey {
862            region_id,
863            row_groups: vec![],
864            scan: test_scan_fingerprint(vec![], vec![], None, false, 0),
865        };
866
867        let metrics_set = ExecutionPlanMetricsSet::new();
868        let part_metrics =
869            PartitionMetrics::new(region_id, 0, "test", Instant::now(), false, &metrics_set);
870
871        assert!(strategy.get_range_result(&key).is_none());
872        (key, part_metrics)
873    }
874
875    async fn finish_cache_batch_buffer(
876        buffer: CacheBatchBuffer,
877        key: RangeScanCacheKey,
878        cache_strategy: CacheStrategy,
879        part_metrics: PartitionMetrics,
880    ) -> Result<Arc<RangeScanCacheValue>> {
881        let (tx, rx) = oneshot::channel();
882        common_telemetry::info!("finish start");
883        buffer.finish(key, cache_strategy, part_metrics, Some(tx));
884        common_telemetry::info!("finish end");
885        rx.await.context(crate::error::RecvSnafu)?
886    }
887
888    async fn new_stream_context(
889        filters: Vec<Expr>,
890        query_time_range: Option<TimestampRange>,
891        partition_time_range: FileTimeRange,
892    ) -> (StreamContext, PartitionRange) {
893        let env = SchedulerEnv::new().await;
894        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
895        let mapper = FlatProjectionMapper::new(&metadata, [0, 2, 3]).unwrap();
896        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
897        let file_id = FileId::random();
898        let file = sst_file_handle_with_file_id(
899            file_id,
900            partition_time_range.0.value(),
901            partition_time_range.1.value(),
902        );
903        let input = ScanInput::new(env.access_layer.clone(), mapper)
904            .with_predicate(predicate)
905            .with_time_range(query_time_range)
906            .with_files(vec![file])
907            .with_cache(test_cache_strategy());
908        let range_meta = RangeMeta {
909            time_range: partition_time_range,
910            indices: smallvec![SourceIndex {
911                index: 0,
912                num_row_groups: 1,
913            }],
914            row_group_indices: smallvec![RowGroupIndex {
915                index: 0,
916                row_group_index: 0,
917            }],
918            num_rows: 10,
919        };
920        let partition_range = range_meta.new_partition_range(0);
921        let (scan_fingerprint, scan_implied_time_range) =
922            match crate::read::scan_region::build_scan_fingerprint(&input) {
923                Some(b) => (Some(b.fingerprint), b.implied_time_range),
924                None => (None, None),
925            };
926        let stream_ctx = StreamContext {
927            input,
928            ranges: vec![range_meta],
929            scan_fingerprint,
930            scan_implied_time_range,
931            query_start: Instant::now(),
932        };
933
934        (stream_ctx, partition_range)
935    }
936
937    /// Helper to create a timestamp millisecond literal.
938    fn ts_lit(val: i64) -> Expr {
939        lit(ScalarValue::TimestampMillisecond(Some(val), None))
940    }
941
942    fn normalized_exprs(exprs: impl IntoIterator<Item = Expr>) -> Vec<String> {
943        let mut exprs = exprs
944            .into_iter()
945            .map(|expr| expr.to_string())
946            .collect::<Vec<_>>();
947        exprs.sort_unstable();
948        exprs
949    }
950
951    async fn assert_range_cache_filters(
952        filters: Vec<Expr>,
953        query_time_range: Option<TimestampRange>,
954        partition_time_range: FileTimeRange,
955        expected_filters: Vec<Expr>,
956        expected_time_filters: Vec<Expr>,
957    ) {
958        let (stream_ctx, part_range) =
959            new_stream_context(filters, query_time_range, partition_time_range).await;
960
961        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
962
963        assert_eq!(
964            key.scan.filters(),
965            normalized_exprs(expected_filters).as_slice()
966        );
967        assert_eq!(
968            key.scan.time_filters(),
969            normalized_exprs(expected_time_filters).as_slice()
970        );
971    }
972
973    #[tokio::test]
974    async fn range_cache_time_filter_key_cases() {
975        let partition = (
976            Timestamp::new_millisecond(1000),
977            Timestamp::new_millisecond(2000),
978        );
979
980        struct Case {
981            filters: Vec<Expr>,
982            query_time_range: Option<TimestampRange>,
983            expected_filters: Vec<Expr>,
984            expected_time_filters: Vec<Expr>,
985        }
986
987        // Time filters are stripped only when their implied range fully covers
988        // the partition's file-time range. `is_not_null(ts)` stays in regular
989        // filters because it is not routed into `time_filters`.
990        for case in [
991            Case {
992                filters: vec![
993                    col("ts").gt_eq(ts_lit(1000)),
994                    col("ts").lt(ts_lit(2001)),
995                    col("ts").is_not_null(),
996                    col("k0").eq(lit("foo")),
997                ],
998                query_time_range: TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
999                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
1000                expected_time_filters: vec![],
1001            },
1002            Case {
1003                filters: vec![
1004                    col("ts").gt_eq(ts_lit(500)),
1005                    col("ts").lt(ts_lit(3000)),
1006                    col("k0").eq(lit("foo")),
1007                ],
1008                query_time_range: TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
1009                expected_filters: vec![col("k0").eq(lit("foo"))],
1010                expected_time_filters: vec![],
1011            },
1012            Case {
1013                filters: vec![
1014                    col("ts").gt_eq(ts_lit(1000)),
1015                    col("ts").lt_eq(ts_lit(2000)),
1016                    col("k0").eq(lit("foo")),
1017                ],
1018                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
1019                expected_filters: vec![col("k0").eq(lit("foo"))],
1020                expected_time_filters: vec![],
1021            },
1022            Case {
1023                filters: vec![
1024                    col("ts").between(ts_lit(1000), ts_lit(2000)),
1025                    col("k0").eq(lit("foo")),
1026                ],
1027                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
1028                expected_filters: vec![col("k0").eq(lit("foo"))],
1029                expected_time_filters: vec![],
1030            },
1031            Case {
1032                filters: vec![col("ts").gt_eq(ts_lit(1200)), col("k0").eq(lit("foo"))],
1033                query_time_range: TimestampRange::with_unit(1200, 2001, TimeUnit::Millisecond),
1034                expected_filters: vec![col("k0").eq(lit("foo"))],
1035                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1200))],
1036            },
1037            Case {
1038                filters: vec![
1039                    col("ts").gt_eq(ts_lit(1500)),
1040                    col("ts").is_not_null(),
1041                    col("k0").eq(lit("foo")),
1042                ],
1043                query_time_range: None,
1044                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
1045                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1500))],
1046            },
1047        ] {
1048            assert_range_cache_filters(
1049                case.filters,
1050                case.query_time_range,
1051                partition,
1052                case.expected_filters,
1053                case.expected_time_filters,
1054            )
1055            .await;
1056        }
1057    }
1058
1059    #[tokio::test]
1060    async fn two_distinct_queries_share_cache_key_when_both_cover() {
1061        let partition_range = (
1062            Timestamp::new_millisecond(1000),
1063            Timestamp::new_millisecond(2000),
1064        );
1065
1066        let (ctx_a, part_a) = new_stream_context(
1067            vec![
1068                col("ts").gt_eq(ts_lit(500)),
1069                col("ts").lt(ts_lit(3000)),
1070                col("k0").eq(lit("foo")),
1071            ],
1072            TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
1073            partition_range,
1074        )
1075        .await;
1076        let (ctx_b, part_b) = new_stream_context(
1077            vec![
1078                col("ts").gt_eq(ts_lit(100)),
1079                col("ts").lt(ts_lit(5000)),
1080                col("k0").eq(lit("foo")),
1081            ],
1082            TimestampRange::with_unit(100, 5000, TimeUnit::Millisecond),
1083            partition_range,
1084        )
1085        .await;
1086
1087        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
1088        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
1089        assert_eq!(key_a.scan, key_b.scan);
1090        assert!(key_a.scan.time_filters().is_empty());
1091    }
1092
1093    #[tokio::test]
1094    async fn disables_optimization_on_or_clause() {
1095        let partition_range = (
1096            Timestamp::new_millisecond(1000),
1097            Timestamp::new_millisecond(2000),
1098        );
1099
1100        let or_a = col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500)));
1101        let or_b = col("ts").gt_eq(ts_lit(900)).or(col("ts").lt(ts_lit(400)));
1102
1103        let (ctx_a, part_a) = new_stream_context(
1104            vec![or_a.clone(), col("k0").eq(lit("foo"))],
1105            None,
1106            partition_range,
1107        )
1108        .await;
1109        let (ctx_b, part_b) = new_stream_context(
1110            vec![or_b.clone(), col("k0").eq(lit("foo"))],
1111            None,
1112            partition_range,
1113        )
1114        .await;
1115
1116        assert!(ctx_a.scan_implied_time_range.is_none());
1117        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
1118        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
1119        assert_ne!(key_a.scan, key_b.scan);
1120        assert_eq!(
1121            key_a.scan.time_filters(),
1122            normalized_exprs([or_a]).as_slice()
1123        );
1124    }
1125
1126    #[tokio::test]
1127    async fn empty_implied_range_does_not_panic_on_non_ms_file_range() {
1128        // Contradictory time predicates make the implied range empty. The
1129        // empty range's sentinel timestamps use `Timestamp::default()` (ms),
1130        // so without the `is_empty()` short-circuit the unit asserts would
1131        // panic against a non-ms `range_meta.time_range`.
1132        let partition = (
1133            Timestamp::new_millisecond(1000),
1134            Timestamp::new_millisecond(2000),
1135        );
1136
1137        let (mut ctx, part_range) = new_stream_context(
1138            vec![col("ts").gt_eq(ts_lit(1500)), col("k0").eq(lit("foo"))],
1139            TimestampRange::with_unit(1500, 3000, TimeUnit::Millisecond),
1140            partition,
1141        )
1142        .await;
1143
1144        ctx.scan_implied_time_range = Some(TimestampRange::empty());
1145        ctx.ranges[0].time_range = (
1146            Timestamp::new(1_000_000_000, TimeUnit::Nanosecond),
1147            Timestamp::new(2_000_000_000, TimeUnit::Nanosecond),
1148        );
1149
1150        let key = build_range_cache_key(&ctx, &part_range).unwrap();
1151        // Empty implied range cannot cover, so time filters stay in the key.
1152        assert!(!key.scan.time_filters().is_empty());
1153    }
1154
1155    fn ms_ts(v: i64) -> Timestamp {
1156        Timestamp::new_millisecond(v)
1157    }
1158
1159    fn implied_ms(expr: Expr) -> Option<TimestampRange> {
1160        implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[&expr])
1161    }
1162
1163    #[test]
1164    fn implied_time_range_supported_exprs() {
1165        for (expr, expected) in [
1166            (
1167                col("ts").gt_eq(ts_lit(1000)),
1168                Some(TimestampRange::from_start(ms_ts(1000))),
1169            ),
1170            (
1171                col("ts").gt(ts_lit(1000)),
1172                Some(TimestampRange::from_start(ms_ts(1001))),
1173            ),
1174            (
1175                col("ts").lt_eq(ts_lit(2000)),
1176                Some(TimestampRange::until_end(ms_ts(2000), true)),
1177            ),
1178            (
1179                col("ts").lt(ts_lit(2000)),
1180                Some(TimestampRange::until_end(ms_ts(2000), false)),
1181            ),
1182            (
1183                col("ts").eq(ts_lit(1500)),
1184                Some(TimestampRange::single(ms_ts(1500))),
1185            ),
1186            (
1187                ts_lit(1000).lt_eq(col("ts")),
1188                Some(TimestampRange::from_start(ms_ts(1000))),
1189            ),
1190            (
1191                col("ts").between(ts_lit(1000), ts_lit(2000)),
1192                Some(TimestampRange::new_inclusive(
1193                    Some(ms_ts(1000)),
1194                    Some(ms_ts(2000)),
1195                )),
1196            ),
1197            (
1198                col("ts")
1199                    .gt_eq(ts_lit(1000))
1200                    .and(col("ts").lt(ts_lit(2000))),
1201                TimestampRange::with_unit(1000, 2000, TimeUnit::Millisecond),
1202            ),
1203            (
1204                col("ts")
1205                    .gt_eq(ts_lit(1000))
1206                    .and(col("ts").lt(ts_lit(5000)))
1207                    .and(col("ts").lt_eq(ts_lit(3000))),
1208                TimestampRange::with_unit(1000, 3001, TimeUnit::Millisecond),
1209            ),
1210        ] {
1211            assert_eq!(implied_ms(expr), expected);
1212        }
1213
1214        assert_eq!(
1215            implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[]),
1216            Some(TimestampRange::min_to_max())
1217        );
1218    }
1219
1220    #[test]
1221    fn implied_time_range_unsupported_exprs() {
1222        let not_between = Expr::Between(Between {
1223            expr: Box::new(col("ts")),
1224            negated: true,
1225            low: Box::new(ts_lit(1000)),
1226            high: Box::new(ts_lit(2000)),
1227        });
1228
1229        for expr in [
1230            not_between,
1231            col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500))),
1232            Expr::Not(Box::new(col("ts").gt_eq(ts_lit(1000)))),
1233            col("ts").in_list(vec![ts_lit(1000), ts_lit(2000)], false),
1234            col("ts").gt_eq(col("other")),
1235            col("other_ts").gt_eq(ts_lit(1000)),
1236        ] {
1237            assert!(implied_ms(expr).is_none());
1238        }
1239    }
1240
1241    #[test]
1242    fn implied_time_range_unit_conversion() {
1243        let second_1 = lit(ScalarValue::TimestampSecond(Some(1), None));
1244        let ns_1500 = lit(ScalarValue::TimestampNanosecond(Some(1_500_000_000), None));
1245        let ns_1500_5 = lit(ScalarValue::TimestampNanosecond(Some(1_500_500_000), None));
1246
1247        for (expr, expected) in [
1248            (
1249                col("ts").gt_eq(second_1.clone()),
1250                Some(TimestampRange::from_start(ms_ts(1000))),
1251            ),
1252            (
1253                col("ts").lt_eq(second_1),
1254                Some(TimestampRange::until_end(ms_ts(1000), true)),
1255            ),
1256            (
1257                col("ts").eq(ns_1500),
1258                Some(TimestampRange::single(ms_ts(1500))),
1259            ),
1260            (col("ts").eq(ns_1500_5.clone()), None),
1261            (
1262                col("ts").gt_eq(ns_1500_5.clone()),
1263                Some(TimestampRange::from_start(ms_ts(1501))),
1264            ),
1265            (
1266                col("ts").lt_eq(ns_1500_5.clone()),
1267                Some(TimestampRange::until_end(ms_ts(1500), true)),
1268            ),
1269            (
1270                col("ts").gt(ns_1500_5.clone()),
1271                Some(TimestampRange::from_start(ms_ts(1501))),
1272            ),
1273            (
1274                col("ts").lt(ns_1500_5),
1275                Some(TimestampRange::until_end(ms_ts(1501), false)),
1276            ),
1277        ] {
1278            assert_eq!(implied_ms(expr), expected);
1279        }
1280    }
1281
1282    #[test]
1283    fn normalizes_and_clears_time_filters() {
1284        let normalized =
1285            test_scan_fingerprint(vec!["k0 = 'foo'".to_string()], vec![], None, true, 0);
1286
1287        assert!(normalized.time_filters().is_empty());
1288
1289        let fingerprint = test_scan_fingerprint(
1290            vec!["k0 = 'foo'".to_string()],
1291            vec!["ts >= 1000".to_string()],
1292            Some(TimeSeriesRowSelector::LastRow),
1293            true,
1294            7,
1295        );
1296
1297        let reset = fingerprint.without_time_filters();
1298
1299        assert_eq!(reset.read_columns(), fingerprint.read_columns());
1300        assert_eq!(reset.read_column_types(), fingerprint.read_column_types());
1301        assert_eq!(reset.filters(), fingerprint.filters());
1302        assert!(reset.time_filters().is_empty());
1303        assert_eq!(reset.series_row_selector, fingerprint.series_row_selector);
1304        assert_eq!(reset.append_mode, fingerprint.append_mode);
1305        assert_eq!(reset.filter_deleted, fingerprint.filter_deleted);
1306        assert_eq!(reset.merge_mode, fingerprint.merge_mode);
1307        assert_eq!(
1308            reset.partition_expr_version,
1309            fingerprint.partition_expr_version
1310        );
1311    }
1312
1313    fn test_schema() -> Arc<datatypes::arrow::datatypes::Schema> {
1314        use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
1315
1316        Arc::new(Schema::new(vec![Field::new(
1317            "value",
1318            ArrowDataType::Int64,
1319            false,
1320        )]))
1321    }
1322
1323    fn make_batch(values: &[i64]) -> RecordBatch {
1324        use datatypes::arrow::array::Int64Array;
1325
1326        RecordBatch::try_new(
1327            test_schema(),
1328            vec![Arc::new(Int64Array::from(values.to_vec()))],
1329        )
1330        .unwrap()
1331    }
1332
1333    fn make_large_binary_batch(rows: usize, bytes_per_row: usize) -> RecordBatch {
1334        use datatypes::arrow::array::BinaryArray;
1335        use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
1336
1337        let schema = Arc::new(Schema::new(vec![Field::new(
1338            "value",
1339            ArrowDataType::Binary,
1340            false,
1341        )]));
1342        let payload = vec![b'x'; bytes_per_row];
1343        let values = (0..rows).map(|_| payload.as_slice()).collect::<Vec<_>>();
1344
1345        RecordBatch::try_new(schema, vec![Arc::new(BinaryArray::from_vec(values))]).unwrap()
1346    }
1347
1348    #[test]
1349    fn compact_record_batches_keeps_original_boundaries() {
1350        let batches = vec![make_batch(&[1, 2]), make_batch(&[3]), make_batch(&[4, 5])];
1351
1352        let compacted = compact_record_batches(batches).unwrap();
1353
1354        assert_eq!(compacted.batch.num_rows(), 5);
1355        assert_eq!(compacted.slice_lengths, vec![2, 1, 2]);
1356    }
1357
1358    #[tokio::test]
1359    async fn cached_flat_range_stream_replays_original_batches() {
1360        let value = Arc::new(RangeScanCacheValue::new(
1361            vec![CachedBatchSlice {
1362                batch: make_batch(&[1, 2, 3]),
1363                slice_lengths: vec![2, 1],
1364            }],
1365            make_batch(&[1, 2, 3]).get_array_memory_size(),
1366        ));
1367
1368        let replayed = cached_flat_range_stream(value)
1369            .try_collect::<Vec<_>>()
1370            .await
1371            .unwrap();
1372
1373        assert_eq!(replayed.len(), 2);
1374        assert_eq!(replayed[0].num_rows(), 2);
1375        assert_eq!(replayed[1].num_rows(), 1);
1376    }
1377
1378    #[tokio::test]
1379    async fn cache_batch_buffer_finishes_pending_batches() {
1380        let strategy = test_cache_strategy();
1381        let batch = make_batch(&[1, 2, 3]);
1382        let expected_size = batch.get_array_memory_size();
1383        let (key, part_metrics) = test_cache_context(&strategy);
1384
1385        let mut buffer = CacheBatchBuffer::new(&strategy);
1386        buffer.push(batch).unwrap();
1387
1388        let value = finish_cache_batch_buffer(buffer, key.clone(), strategy.clone(), part_metrics)
1389            .await
1390            .unwrap();
1391        assert_eq!(value.cached_batches.len(), 1);
1392        assert_eq!(value.cached_batches[0].slice_lengths, vec![3]);
1393        assert_eq!(value.estimated_batches_size, expected_size);
1394        assert!(Arc::ptr_eq(
1395            &value,
1396            &strategy.get_range_result(&key).unwrap()
1397        ));
1398    }
1399
1400    #[tokio::test]
1401    async fn cache_batch_buffer_compacts_when_rows_exceed_default_batch_size() {
1402        let strategy = test_cache_strategy();
1403        let batch = make_batch(&vec![1; DEFAULT_READ_BATCH_SIZE / 2 + 1]);
1404        let (key, part_metrics) = test_cache_context(&strategy);
1405
1406        let mut buffer = CacheBatchBuffer::new(&strategy);
1407        buffer.push(batch.clone()).unwrap();
1408        buffer.push(batch).unwrap();
1409
1410        assert_eq!(buffer.buffered_rows, 0);
1411        assert!(buffer.buffered_batches.is_empty());
1412
1413        let value = finish_cache_batch_buffer(buffer, key, strategy, part_metrics)
1414            .await
1415            .unwrap();
1416        assert_eq!(value.cached_batches.len(), 1);
1417        assert_eq!(
1418            value.cached_batches[0].slice_lengths,
1419            vec![
1420                DEFAULT_READ_BATCH_SIZE / 2 + 1,
1421                DEFAULT_READ_BATCH_SIZE / 2 + 1
1422            ]
1423        );
1424    }
1425
1426    #[tokio::test]
1427    async fn cache_batch_buffer_compacts_when_buffered_size_exceeds_threshold() {
1428        let large_batch = make_large_binary_batch(DEFAULT_READ_BATCH_SIZE, 4096);
1429        let strategy = CacheStrategy::EnableAll(Arc::new(
1430            CacheManager::builder()
1431                .range_result_cache_size((large_batch.get_array_memory_size() * 3) as u64)
1432                .build(),
1433        ));
1434        let (key, part_metrics) = test_cache_context(&strategy);
1435
1436        let mut buffer = CacheBatchBuffer::new(&strategy);
1437        buffer.push(large_batch.clone()).unwrap();
1438
1439        assert_eq!(buffer.buffered_rows, large_batch.num_rows());
1440        assert_eq!(buffer.buffered_batches.len(), 1);
1441
1442        buffer.push(large_batch.clone()).unwrap();
1443
1444        assert_eq!(buffer.buffered_rows, 0);
1445        assert!(buffer.buffered_batches.is_empty());
1446
1447        let value = finish_cache_batch_buffer(buffer, key, strategy, part_metrics)
1448            .await
1449            .unwrap();
1450        assert_eq!(value.cached_batches.len(), 1);
1451        assert_eq!(
1452            value.cached_batches[0].slice_lengths,
1453            vec![large_batch.num_rows(), large_batch.num_rows()]
1454        );
1455    }
1456
1457    #[tokio::test]
1458    async fn cache_batch_buffer_skips_cache_when_compacted_size_exceeds_limit() {
1459        let large_batch = make_large_binary_batch(DEFAULT_READ_BATCH_SIZE / 2 + 1, 4096);
1460        // Budget only fits two large batches.
1461        let budget = (large_batch.get_array_memory_size() as u64) * 2 + 1;
1462        let strategy = CacheStrategy::EnableAll(Arc::new(
1463            CacheManager::builder()
1464                .range_result_cache_size(budget)
1465                .build(),
1466        ));
1467        let (key, part_metrics) = test_cache_context(&strategy);
1468
1469        let mut buffer = CacheBatchBuffer::new(&strategy);
1470        for _ in 0..4 {
1471            buffer.push(large_batch.clone()).unwrap();
1472        }
1473        assert!(
1474            finish_cache_batch_buffer(buffer, key.clone(), strategy.clone(), part_metrics)
1475                .await
1476                .is_err()
1477        );
1478        assert!(strategy.get_range_result(&key).is_none());
1479    }
1480}