Skip to main content

mito2/
cache.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Cache for the engine.
16
17pub(crate) mod cache_size;
18
19pub(crate) mod file_cache;
20pub(crate) mod index;
21pub(crate) mod manifest_cache;
22#[cfg(test)]
23pub(crate) mod test_util;
24pub(crate) mod write_cache;
25
26use std::collections::{BTreeMap, HashMap};
27use std::mem;
28use std::ops::Range;
29use std::sync::{Arc, RwLock};
30
31use bytes::Bytes;
32use common_base::readable_size::ReadableSize;
33use common_telemetry::warn;
34use datatypes::arrow::buffer::BooleanBuffer;
35use datatypes::arrow::record_batch::RecordBatch;
36use datatypes::value::Value;
37use datatypes::vectors::VectorRef;
38use index::bloom_filter_index::{BloomFilterIndexCache, BloomFilterIndexCacheRef};
39use index::result_cache::IndexResultCache;
40use moka::notification::RemovalCause;
41use moka::sync::Cache;
42use object_store::ObjectStore;
43use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
44use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData};
45use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
46use smallvec::SmallVec;
47use snafu::{OptionExt, ResultExt};
48use store_api::metadata::RegionMetadataRef;
49use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
50
51use crate::cache::cache_size::parquet_meta_size;
52use crate::cache::file_cache::{FileType, IndexKey};
53use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCacheRef};
54#[cfg(feature = "vector_index")]
55use crate::cache::index::vector_index::{VectorIndexCache, VectorIndexCacheRef};
56use crate::cache::write_cache::WriteCacheRef;
57use crate::error::{InvalidMetadataSnafu, InvalidParquetSnafu, Result, UnexpectedSnafu};
58use crate::memtable::record_batch_estimated_size;
59use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
60use crate::read::Batch;
61use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue};
62use crate::sst::file::{RegionFileId, RegionIndexId};
63use crate::sst::parquet::PARQUET_METADATA_KEY;
64use crate::sst::parquet::read_columns::ParquetReadColumns;
65use crate::sst::parquet::reader::MetadataCacheMetrics;
66
/// Metrics type key for sst meta.
const SST_META_TYPE: &str = "sst_meta";
/// Metrics type key for vector.
const VECTOR_TYPE: &str = "vector";
/// Metrics type key for pages.
const PAGE_TYPE: &str = "page";
/// Metrics type key for files on the local store.
const FILE_TYPE: &str = "file";
/// Metrics type key for index files (puffin) on the local store.
const INDEX_TYPE: &str = "index";
/// Metrics type key for selector result cache.
const SELECTOR_RESULT_TYPE: &str = "selector_result";
/// Metrics type key for range scan result cache.
const RANGE_RESULT_TYPE: &str = "range_result";
/// Metrics type key for prefilter result cache.
const PREFILTER_RESULT_TYPE: &str = "prefilter_result";
/// Byte budget used by [RangeResultMemoryLimiter::default()].
const RANGE_RESULT_CONCAT_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(512);
/// Number of bytes one permit stands for in [RangeResultMemoryLimiter::default()].
const RANGE_RESULT_CONCAT_MEMORY_PERMIT: ReadableSize = ReadableSize::kb(1);
85
86#[derive(Debug)]
87pub(crate) struct RangeResultMemoryLimiter {
88    semaphore: Arc<tokio::sync::Semaphore>,
89    permit_bytes: usize,
90    total_permits: usize,
91}
92
93impl Default for RangeResultMemoryLimiter {
94    fn default() -> Self {
95        Self::new(
96            RANGE_RESULT_CONCAT_MEMORY_LIMIT.as_bytes() as usize,
97            RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize,
98        )
99    }
100}
101
102impl RangeResultMemoryLimiter {
103    pub(crate) fn new(limit_bytes: usize, permit_bytes: usize) -> Self {
104        let permit_bytes = permit_bytes.max(1);
105        let total_permits = limit_bytes
106            .div_ceil(permit_bytes)
107            .clamp(1, tokio::sync::Semaphore::MAX_PERMITS);
108        Self {
109            semaphore: Arc::new(tokio::sync::Semaphore::new(total_permits)),
110            permit_bytes,
111            total_permits,
112        }
113    }
114
115    #[cfg(test)]
116    pub(crate) fn permit_bytes(&self) -> usize {
117        self.permit_bytes
118    }
119
120    #[cfg(test)]
121    pub(crate) fn available_permits(&self) -> usize {
122        self.semaphore.available_permits()
123    }
124
125    pub(crate) async fn acquire(&self, bytes: usize) -> Result<tokio::sync::SemaphorePermit<'_>> {
126        let permits = bytes.div_ceil(self.permit_bytes).max(1);
127        if permits > self.total_permits {
128            return UnexpectedSnafu {
129                reason: format!(
130                    "range result memory request of {bytes} bytes exceeds limiter capacity of {} bytes",
131                    self.total_permits.saturating_mul(self.permit_bytes)
132                ),
133            }
134            .fail();
135        }
136        self.semaphore
137            .acquire_many(permits as u32)
138            .await
139            .map_err(|_| {
140                UnexpectedSnafu {
141                    reason: "range result memory limiter is unexpectedly closed",
142                }
143                .build()
144            })
145    }
146}
147
148/// Cached SST metadata combines the parquet footer with the decoded region metadata.
149///
150/// The cached parquet footer strips the `greptime:metadata` JSON payload and stores the decoded
151/// [RegionMetadata] separately so readers can skip repeated deserialization work.
152#[derive(Debug)]
153pub(crate) struct CachedSstMeta {
154    parquet_metadata: Arc<ParquetMetaData>,
155    region_metadata: RegionMetadataRef,
156    region_metadata_weight: usize,
157    page_index_policy: PageIndexPolicy,
158}
159
160impl CachedSstMeta {
161    #[cfg(test)]
162    pub(crate) fn try_new(file_path: &str, parquet_metadata: ParquetMetaData) -> Result<Self> {
163        let page_index_policy = infer_loaded_page_index_policy(&parquet_metadata);
164        Self::try_new_with_page_index_policy(file_path, parquet_metadata, None, page_index_policy)
165    }
166
167    pub(crate) fn try_new_with_region_metadata(
168        file_path: &str,
169        parquet_metadata: ParquetMetaData,
170        region_metadata: Option<RegionMetadataRef>,
171    ) -> Result<Self> {
172        let page_index_policy = infer_loaded_page_index_policy(&parquet_metadata);
173        Self::try_new_with_page_index_policy(
174            file_path,
175            parquet_metadata,
176            region_metadata,
177            page_index_policy,
178        )
179    }
180
181    pub(crate) fn try_new_with_page_index_policy(
182        file_path: &str,
183        parquet_metadata: ParquetMetaData,
184        region_metadata: Option<RegionMetadataRef>,
185        page_index_policy: PageIndexPolicy,
186    ) -> Result<Self> {
187        let file_metadata = parquet_metadata.file_metadata();
188        let key_values = file_metadata
189            .key_value_metadata()
190            .context(InvalidParquetSnafu {
191                file: file_path,
192                reason: "missing key value meta",
193            })?;
194        let meta_value = key_values
195            .iter()
196            .find(|kv| kv.key == PARQUET_METADATA_KEY)
197            .with_context(|| InvalidParquetSnafu {
198                file: file_path,
199                reason: format!("key {} not found", PARQUET_METADATA_KEY),
200            })?;
201        let json = meta_value
202            .value
203            .as_ref()
204            .with_context(|| InvalidParquetSnafu {
205                file: file_path,
206                reason: format!("No value for key {}", PARQUET_METADATA_KEY),
207            })?;
208        let region_metadata = match region_metadata {
209            Some(region_metadata) => region_metadata,
210            None => Arc::new(
211                store_api::metadata::RegionMetadata::from_json(json)
212                    .context(InvalidMetadataSnafu)?,
213            ),
214        };
215        // Keep the previous JSON-byte floor and charge the decoded structures as well.
216        let region_metadata_weight = region_metadata.estimated_size().max(json.len());
217        let parquet_metadata = Arc::new(strip_region_metadata_from_parquet(parquet_metadata));
218
219        Ok(Self {
220            parquet_metadata,
221            region_metadata,
222            region_metadata_weight,
223            page_index_policy,
224        })
225    }
226
227    pub(crate) fn parquet_metadata(&self) -> Arc<ParquetMetaData> {
228        self.parquet_metadata.clone()
229    }
230
231    pub(crate) fn region_metadata(&self) -> RegionMetadataRef {
232        self.region_metadata.clone()
233    }
234
235    fn satisfies_page_index_policy(&self, requested: PageIndexPolicy) -> bool {
236        match requested {
237            PageIndexPolicy::Skip => true,
238            PageIndexPolicy::Optional => self.page_index_policy != PageIndexPolicy::Skip,
239            PageIndexPolicy::Required => self.page_index_policy == PageIndexPolicy::Required,
240        }
241    }
242}
243
244fn infer_loaded_page_index_policy(parquet_metadata: &ParquetMetaData) -> PageIndexPolicy {
245    if parquet_metadata.column_index().is_some() || parquet_metadata.offset_index().is_some() {
246        PageIndexPolicy::Optional
247    } else {
248        PageIndexPolicy::Skip
249    }
250}
251
252fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> ParquetMetaData {
253    let file_metadata = parquet_metadata.file_metadata();
254    let filtered_key_values = file_metadata.key_value_metadata().and_then(|key_values| {
255        let filtered = key_values
256            .iter()
257            .filter(|kv| kv.key != PARQUET_METADATA_KEY)
258            .cloned()
259            .collect::<Vec<_>>();
260        (!filtered.is_empty()).then_some(filtered)
261    });
262    let stripped_file_metadata = FileMetaData::new(
263        file_metadata.version(),
264        file_metadata.num_rows(),
265        file_metadata.created_by().map(ToString::to_string),
266        filtered_key_values,
267        file_metadata.schema_descr_ptr(),
268        file_metadata.column_orders().cloned(),
269    );
270
271    let mut builder = parquet_metadata.into_builder();
272    let row_groups = builder.take_row_groups();
273    let column_index = builder.take_column_index();
274    let offset_index = builder.take_offset_index();
275
276    parquet::file::metadata::ParquetMetaDataBuilder::new(stripped_file_metadata)
277        .set_row_groups(row_groups)
278        .set_column_index(column_index)
279        .set_offset_index(offset_index)
280        .build()
281}
282
283fn removal_cause_str(cause: RemovalCause) -> &'static str {
284    match cause {
285        RemovalCause::Expired => "expired",
286        RemovalCause::Explicit => "explicit",
287        RemovalCause::Replaced => "replaced",
288        RemovalCause::Size => "size",
289    }
290}
291
292#[derive(Debug, Clone, PartialEq, Eq, Hash)]
293pub(crate) struct PrefilterRowSelector {
294    row_count: usize,
295    skip: bool,
296}
297
298// `parquet::arrow::arrow_reader::RowSelector` does not implement `Hash`, but
299// prefilter cache keys must hash the upstream row-selection snapshot. Keep a
300// local hashable mirror of the two fields that define selector semantics.
301// TODO(yingwen): Remove this mirror if upstream `RowSelector` implements `Hash`.
302impl From<&RowSelector> for PrefilterRowSelector {
303    fn from(selector: &RowSelector) -> Self {
304        Self {
305            row_count: selector.row_count,
306            skip: selector.skip,
307        }
308    }
309}
310
311/// Key for a cached prefilter result.
312#[derive(Debug, Clone, PartialEq, Eq, Hash)]
313pub(crate) struct PrefilterKey {
314    file_id: FileId,
315    row_group_idx: u32,
316    row_selection: Option<Arc<Vec<PrefilterRowSelector>>>,
317    schema_version: u64,
318    filter_exprs: SmallVec<[String; 1]>,
319    mem_usage: usize,
320}
321
322impl PrefilterKey {
323    pub(crate) fn row_selection_snapshot(
324        row_selection: Option<&RowSelection>,
325    ) -> Option<Arc<Vec<PrefilterRowSelector>>> {
326        row_selection.map(|selection| {
327            Arc::new(
328                selection
329                    .iter()
330                    .map(PrefilterRowSelector::from)
331                    .collect::<Vec<_>>(),
332            )
333        })
334    }
335
336    pub(crate) fn new(
337        file_id: FileId,
338        row_group_idx: u32,
339        row_selection: Option<Arc<Vec<PrefilterRowSelector>>>,
340        schema_version: u64,
341        filter_exprs: SmallVec<[String; 1]>,
342    ) -> Self {
343        let row_selection_bytes = row_selection
344            .as_ref()
345            .map(|selection| selection.len() * mem::size_of::<PrefilterRowSelector>())
346            .unwrap_or(0);
347        let spilled_expr_bytes = if filter_exprs.spilled() {
348            filter_exprs.capacity() * mem::size_of::<String>()
349        } else {
350            0
351        };
352        let expr_bytes = filter_exprs.iter().map(|s| s.capacity()).sum::<usize>();
353
354        Self {
355            file_id,
356            row_group_idx,
357            row_selection,
358            schema_version,
359            filter_exprs,
360            mem_usage: mem::size_of::<Self>()
361                + row_selection_bytes
362                + spilled_expr_bytes
363                + expr_bytes,
364        }
365    }
366
367    fn mem_usage(&self) -> usize {
368        self.mem_usage
369    }
370}
371
372type PrefilterResultCache = Cache<PrefilterKey, Arc<BooleanBuffer>>;
373
374fn new_prefilter_result_cache(capacity: u64) -> PrefilterResultCache {
375    Cache::builder()
376        .max_capacity(capacity)
377        .weigher(prefilter_result_cache_weight)
378        .eviction_listener(|k, v, cause| {
379            let size = prefilter_result_cache_weight(&k, &v);
380            CACHE_BYTES
381                .with_label_values(&[PREFILTER_RESULT_TYPE])
382                .sub(size.into());
383            CACHE_EVICTION
384                .with_label_values(&[PREFILTER_RESULT_TYPE, removal_cause_str(cause)])
385                .inc();
386        })
387        .build()
388}
389
390fn prefilter_result_cache_weight(k: &PrefilterKey, v: &Arc<BooleanBuffer>) -> u32 {
391    (k.mem_usage() + mem::size_of::<BooleanBuffer>() + v.values().len()) as u32
392}
393
394/// Cache strategies that may only enable a subset of caches.
395#[derive(Clone)]
396pub enum CacheStrategy {
397    /// Strategy for normal operations.
398    /// Doesn't disable any cache.
399    EnableAll(CacheManagerRef),
400    /// Strategy for compaction.
401    /// Disables some caches during compaction to avoid affecting queries.
402    /// Enables the write cache so that the compaction can read files cached
403    /// in the write cache and write the compacted files back to the write cache.
404    Compaction(CacheManagerRef),
405    /// Do not use any cache.
406    Disabled,
407}
408
409impl CacheStrategy {
410    /// Gets fused SST metadata with cache metrics tracking.
411    pub(crate) async fn get_sst_meta_data(
412        &self,
413        file_id: RegionFileId,
414        metrics: &mut MetadataCacheMetrics,
415        page_index_policy: PageIndexPolicy,
416    ) -> Option<Arc<CachedSstMeta>> {
417        match self {
418            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
419                cache_manager
420                    .get_sst_meta_data(file_id, metrics, page_index_policy)
421                    .await
422            }
423            CacheStrategy::Disabled => {
424                metrics.cache_miss += 1;
425                None
426            }
427        }
428    }
429
430    /// Calls [CacheManager::get_sst_meta_data_from_mem_cache()].
431    pub(crate) fn get_sst_meta_data_from_mem_cache(
432        &self,
433        file_id: RegionFileId,
434        page_index_policy: PageIndexPolicy,
435    ) -> Option<Arc<CachedSstMeta>> {
436        match self {
437            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
438                cache_manager.get_sst_meta_data_from_mem_cache(file_id, page_index_policy)
439            }
440            CacheStrategy::Disabled => None,
441        }
442    }
443
444    /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()].
445    pub fn get_parquet_meta_data_from_mem_cache(
446        &self,
447        file_id: RegionFileId,
448    ) -> Option<Arc<ParquetMetaData>> {
449        self.get_sst_meta_data_from_mem_cache(file_id, PageIndexPolicy::Skip)
450            .map(|metadata| metadata.parquet_metadata())
451    }
452
453    /// Calls [CacheManager::put_sst_meta_data()].
454    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
455        match self {
456            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
457                cache_manager.put_sst_meta_data(file_id, metadata);
458            }
459            CacheStrategy::Disabled => {}
460        }
461    }
462
463    /// Calls [CacheManager::put_parquet_meta_data()].
464    pub fn put_parquet_meta_data(
465        &self,
466        file_id: RegionFileId,
467        metadata: Arc<ParquetMetaData>,
468        region_metadata: Option<RegionMetadataRef>,
469    ) {
470        match self {
471            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
472                cache_manager.put_parquet_meta_data(file_id, metadata, region_metadata);
473            }
474            CacheStrategy::Disabled => {}
475        }
476    }
477
478    /// Calls [CacheManager::get_prefilter_result()].
479    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
480    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
481        match self {
482            CacheStrategy::EnableAll(cache_manager) => cache_manager.get_prefilter_result(key),
483            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
484        }
485    }
486
487    /// Calls [CacheManager::put_prefilter_result()].
488    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
489    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
490        if let CacheStrategy::EnableAll(cache_manager) = self {
491            cache_manager.put_prefilter_result(key, result);
492        }
493    }
494
495    /// Calls [CacheManager::remove_parquet_meta_data()].
496    pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
497        match self {
498            CacheStrategy::EnableAll(cache_manager) => {
499                cache_manager.remove_parquet_meta_data(file_id);
500            }
501            CacheStrategy::Compaction(cache_manager) => {
502                cache_manager.remove_parquet_meta_data(file_id);
503            }
504            CacheStrategy::Disabled => {}
505        }
506    }
507
508    /// Calls [CacheManager::get_repeated_vector()].
509    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
510    pub fn get_repeated_vector(
511        &self,
512        data_type: &ConcreteDataType,
513        value: &Value,
514    ) -> Option<VectorRef> {
515        match self {
516            CacheStrategy::EnableAll(cache_manager) => {
517                cache_manager.get_repeated_vector(data_type, value)
518            }
519            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
520        }
521    }
522
523    /// Calls [CacheManager::put_repeated_vector()].
524    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
525    pub fn put_repeated_vector(&self, value: Value, vector: VectorRef) {
526        if let CacheStrategy::EnableAll(cache_manager) = self {
527            cache_manager.put_repeated_vector(value, vector);
528        }
529    }
530
531    /// Calls [CacheManager::get_page_ranges()].
532    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
533    pub fn get_page_ranges(
534        &self,
535        file_id: FileId,
536        row_group_idx: usize,
537        ranges: &[Range<u64>],
538    ) -> Option<PageRangeLookup> {
539        match self {
540            CacheStrategy::EnableAll(cache_manager) => {
541                cache_manager.get_page_ranges(file_id, row_group_idx, ranges)
542            }
543            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
544        }
545    }
546
547    /// Calls [CacheManager::put_page_ranges()].
548    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
549    pub fn put_page_ranges(
550        &self,
551        file_id: FileId,
552        row_group_idx: usize,
553        ranges: &[Range<u64>],
554        pages: &[Bytes],
555    ) {
556        if let CacheStrategy::EnableAll(cache_manager) = self {
557            cache_manager.put_page_ranges(file_id, row_group_idx, ranges, pages);
558        }
559    }
560
561    /// Calls [CacheManager::evict_puffin_cache()].
562    pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) {
563        match self {
564            CacheStrategy::EnableAll(cache_manager) => {
565                cache_manager.evict_puffin_cache(file_id).await
566            }
567            CacheStrategy::Compaction(cache_manager) => {
568                cache_manager.evict_puffin_cache(file_id).await
569            }
570            CacheStrategy::Disabled => {}
571        }
572    }
573
574    /// Calls [CacheManager::get_selector_result()].
575    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
576    pub fn get_selector_result(
577        &self,
578        selector_key: &SelectorResultKey,
579    ) -> Option<Arc<SelectorResultValue>> {
580        match self {
581            CacheStrategy::EnableAll(cache_manager) => {
582                cache_manager.get_selector_result(selector_key)
583            }
584            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
585        }
586    }
587
588    /// Calls [CacheManager::put_selector_result()].
589    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
590    pub fn put_selector_result(
591        &self,
592        selector_key: SelectorResultKey,
593        result: Arc<SelectorResultValue>,
594    ) {
595        if let CacheStrategy::EnableAll(cache_manager) = self {
596            cache_manager.put_selector_result(selector_key, result);
597        }
598    }
599
600    /// Calls [CacheManager::get_range_result()].
601    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
602    #[allow(dead_code)]
603    pub(crate) fn get_range_result(
604        &self,
605        key: &RangeScanCacheKey,
606    ) -> Option<Arc<RangeScanCacheValue>> {
607        match self {
608            CacheStrategy::EnableAll(cache_manager) => cache_manager.get_range_result(key),
609            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
610        }
611    }
612
613    /// Calls [CacheManager::put_range_result()].
614    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
615    pub(crate) fn put_range_result(
616        &self,
617        key: RangeScanCacheKey,
618        result: Arc<RangeScanCacheValue>,
619    ) {
620        if let CacheStrategy::EnableAll(cache_manager) = self {
621            cache_manager.put_range_result(key, result);
622        }
623    }
624
625    /// Returns true if the range result cache is enabled.
626    pub(crate) fn has_range_result_cache(&self) -> bool {
627        match self {
628            CacheStrategy::EnableAll(cache_manager) => cache_manager.has_range_result_cache(),
629            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => false,
630        }
631    }
632
633    pub(crate) fn range_result_memory_limiter(&self) -> Option<&Arc<RangeResultMemoryLimiter>> {
634        match self {
635            CacheStrategy::EnableAll(cache_manager) => {
636                Some(cache_manager.range_result_memory_limiter())
637            }
638            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
639        }
640    }
641
642    pub(crate) fn range_result_cache_size(&self) -> Option<usize> {
643        match self {
644            CacheStrategy::EnableAll(cache_manager) => {
645                Some(cache_manager.range_result_cache_size())
646            }
647            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
648        }
649    }
650
651    /// Calls [CacheManager::write_cache()].
652    /// It returns None if the strategy is [CacheStrategy::Disabled].
653    pub fn write_cache(&self) -> Option<&WriteCacheRef> {
654        match self {
655            CacheStrategy::EnableAll(cache_manager) => cache_manager.write_cache(),
656            CacheStrategy::Compaction(cache_manager) => cache_manager.write_cache(),
657            CacheStrategy::Disabled => None,
658        }
659    }
660
661    /// Calls [CacheManager::index_cache()].
662    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
663    pub fn inverted_index_cache(&self) -> Option<&InvertedIndexCacheRef> {
664        match self {
665            CacheStrategy::EnableAll(cache_manager) => cache_manager.inverted_index_cache(),
666            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
667        }
668    }
669
670    /// Calls [CacheManager::bloom_filter_index_cache()].
671    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
672    pub fn bloom_filter_index_cache(&self) -> Option<&BloomFilterIndexCacheRef> {
673        match self {
674            CacheStrategy::EnableAll(cache_manager) => cache_manager.bloom_filter_index_cache(),
675            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
676        }
677    }
678
679    /// Calls [CacheManager::vector_index_cache()].
680    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
681    #[cfg(feature = "vector_index")]
682    pub fn vector_index_cache(&self) -> Option<&VectorIndexCacheRef> {
683        match self {
684            CacheStrategy::EnableAll(cache_manager) => cache_manager.vector_index_cache(),
685            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
686        }
687    }
688
689    /// Calls [CacheManager::puffin_metadata_cache()].
690    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
691    pub fn puffin_metadata_cache(&self) -> Option<&PuffinMetadataCacheRef> {
692        match self {
693            CacheStrategy::EnableAll(cache_manager) => cache_manager.puffin_metadata_cache(),
694            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
695        }
696    }
697
698    /// Calls [CacheManager::index_result_cache()].
699    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
700    pub fn index_result_cache(&self) -> Option<&IndexResultCache> {
701        match self {
702            CacheStrategy::EnableAll(cache_manager) => cache_manager.index_result_cache(),
703            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
704        }
705    }
706
707    /// Triggers download if the strategy is [CacheStrategy::EnableAll] and write cache is available.
708    pub fn maybe_download_background(
709        &self,
710        index_key: IndexKey,
711        remote_path: String,
712        remote_store: ObjectStore,
713        file_size: u64,
714    ) {
715        if let CacheStrategy::EnableAll(cache_manager) = self
716            && let Some(write_cache) = cache_manager.write_cache()
717        {
718            write_cache.file_cache().maybe_download_background(
719                index_key,
720                remote_path,
721                remote_store,
722                file_size,
723            );
724        }
725    }
726}
727
/// Manages cached data for the engine.
///
/// All caches are disabled by default: with the derived [Default], every `Option` field
/// is `None`.
#[derive(Default)]
pub struct CacheManager {
    /// Cache for SST metadata.
    sst_meta_cache: Option<SstMetaCache>,
    /// Cache for vectors.
    vector_cache: Option<VectorCache>,
    /// Cache for SST byte ranges.
    page_cache: Option<Arc<PageRangeCache>>,
    /// A Cache for writing files to object stores.
    write_cache: Option<WriteCacheRef>,
    /// Cache for inverted index.
    inverted_index_cache: Option<InvertedIndexCacheRef>,
    /// Cache for bloom filter index.
    bloom_filter_index_cache: Option<BloomFilterIndexCacheRef>,
    /// Cache for vector index.
    #[cfg(feature = "vector_index")]
    vector_index_cache: Option<VectorIndexCacheRef>,
    /// Puffin metadata cache.
    puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
    /// Cache for time series selectors.
    selector_result_cache: Option<SelectorResultCache>,
    /// Cache for range scan outputs in flat format.
    range_result_cache: Option<RangeResultCache>,
    /// Configured capacity for range scan outputs in flat format.
    range_result_cache_size: u64,
    /// Shared memory limiter for async range-result cache tasks.
    /// Unlike the caches, it is always present.
    range_result_memory_limiter: Arc<RangeResultMemoryLimiter>,
    /// Cache for index result.
    index_result_cache: Option<IndexResultCache>,
    /// Cache for prefilter result.
    prefilter_result_cache: Option<PrefilterResultCache>,
}

/// Shared reference to a [CacheManager].
pub type CacheManagerRef = Arc<CacheManager>;
765
766impl CacheManager {
767    /// Returns a builder to build the cache.
768    pub fn builder() -> CacheManagerBuilder {
769        CacheManagerBuilder::default()
770    }
771
772    /// Gets fused SST metadata with metrics tracking.
773    /// Tries in-memory cache first, then file cache, updating metrics accordingly.
774    pub(crate) async fn get_sst_meta_data(
775        &self,
776        file_id: RegionFileId,
777        metrics: &mut MetadataCacheMetrics,
778        page_index_policy: PageIndexPolicy,
779    ) -> Option<Arc<CachedSstMeta>> {
780        if let Some(metadata) = self.get_sst_meta_data_from_mem_cache(file_id, page_index_policy) {
781            metrics.mem_cache_hit += 1;
782            return Some(metadata);
783        }
784
785        let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
786        if let Some(write_cache) = &self.write_cache
787            && let Some(metadata) = write_cache
788                .file_cache()
789                .get_sst_meta_data(key, metrics, page_index_policy)
790                .await
791        {
792            metrics.file_cache_hit += 1;
793            self.put_sst_meta_data(file_id, metadata.clone());
794            return Some(metadata);
795        }
796
797        metrics.cache_miss += 1;
798        None
799    }
800
801    /// Gets cached [ParquetMetaData] with metrics tracking.
802    /// Tries in-memory cache first, then file cache, updating metrics accordingly.
803    pub(crate) async fn get_parquet_meta_data(
804        &self,
805        file_id: RegionFileId,
806        metrics: &mut MetadataCacheMetrics,
807        page_index_policy: PageIndexPolicy,
808    ) -> Option<Arc<ParquetMetaData>> {
809        self.get_sst_meta_data(file_id, metrics, page_index_policy)
810            .await
811            .map(|metadata| metadata.parquet_metadata())
812    }
813
814    /// Gets cached fused SST metadata from in-memory cache.
815    /// This method does not perform I/O.
816    pub(crate) fn get_sst_meta_data_from_mem_cache(
817        &self,
818        file_id: RegionFileId,
819        page_index_policy: PageIndexPolicy,
820    ) -> Option<Arc<CachedSstMeta>> {
821        self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
822            let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id()));
823            let value =
824                value.filter(|metadata| metadata.satisfies_page_index_policy(page_index_policy));
825            update_hit_miss(value, SST_META_TYPE)
826        })
827    }
828
829    /// Gets cached [ParquetMetaData] from in-memory cache.
830    /// This method does not perform I/O.
831    pub fn get_parquet_meta_data_from_mem_cache(
832        &self,
833        file_id: RegionFileId,
834    ) -> Option<Arc<ParquetMetaData>> {
835        self.get_sst_meta_data_from_mem_cache(file_id, PageIndexPolicy::Skip)
836            .map(|metadata| metadata.parquet_metadata())
837    }
838
839    /// Puts fused SST metadata into the cache.
840    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
841        if let Some(cache) = &self.sst_meta_cache {
842            let key = SstMetaKey(file_id.region_id(), file_id.file_id());
843            CACHE_BYTES
844                .with_label_values(&[SST_META_TYPE])
845                .add(meta_cache_weight(&key, &metadata).into());
846            cache.insert(key, metadata);
847        }
848    }
849
850    /// Puts [ParquetMetaData] into the cache.
851    pub fn put_parquet_meta_data(
852        &self,
853        file_id: RegionFileId,
854        metadata: Arc<ParquetMetaData>,
855        region_metadata: Option<RegionMetadataRef>,
856    ) {
857        if self.sst_meta_cache.is_some() {
858            let file_path = format!(
859                "region_id={}, file_id={}",
860                file_id.region_id(),
861                file_id.file_id()
862            );
863            match CachedSstMeta::try_new_with_region_metadata(
864                &file_path,
865                Arc::unwrap_or_clone(metadata),
866                region_metadata,
867            ) {
868                Ok(metadata) => self.put_sst_meta_data(file_id, Arc::new(metadata)),
869                Err(err) => warn!(
870                    err; "Failed to decode region metadata while caching parquet metadata, region_id: {}, file_id: {}",
871                    file_id.region_id(),
872                    file_id.file_id()
873                ),
874            }
875        }
876    }
877
878    /// Removes [ParquetMetaData] from the cache.
879    pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
880        if let Some(cache) = &self.sst_meta_cache {
881            cache.remove(&SstMetaKey(file_id.region_id(), file_id.file_id()));
882        }
883    }
884
885    /// Returns the total weighted size of the in-memory SST meta cache.
886    pub(crate) fn sst_meta_cache_weighted_size(&self) -> u64 {
887        self.sst_meta_cache
888            .as_ref()
889            .map(|cache| cache.weighted_size())
890            .unwrap_or(0)
891    }
892
    /// Returns true if the in-memory SST meta cache is enabled, i.e. the manager holds
    /// an SST meta cache instance.
    pub(crate) fn sst_meta_cache_enabled(&self) -> bool {
        self.sst_meta_cache.is_some()
    }
897
898    /// Gets a vector with repeated value for specific `key`.
899    pub fn get_repeated_vector(
900        &self,
901        data_type: &ConcreteDataType,
902        value: &Value,
903    ) -> Option<VectorRef> {
904        self.vector_cache.as_ref().and_then(|vector_cache| {
905            let value = vector_cache.get(&(data_type.clone(), value.clone()));
906            update_hit_miss(value, VECTOR_TYPE)
907        })
908    }
909
910    /// Puts a vector with repeated value into the cache.
911    pub fn put_repeated_vector(&self, value: Value, vector: VectorRef) {
912        if let Some(cache) = &self.vector_cache {
913            let key = (vector.data_type(), value);
914            CACHE_BYTES
915                .with_label_values(&[VECTOR_TYPE])
916                .add(vector_cache_weight(&key, &vector).into());
917            cache.insert(key, vector);
918        }
919    }
920
921    /// Gets cached byte fragments for the requested ranges.
922    pub fn get_page_ranges(
923        &self,
924        file_id: FileId,
925        row_group_idx: usize,
926        ranges: &[Range<u64>],
927    ) -> Option<PageRangeLookup> {
928        self.page_cache.as_ref().map(|page_cache| {
929            let lookup = page_cache.lookup(file_id, row_group_idx, ranges);
930            if lookup.cached_bytes > 0 {
931                CACHE_HIT.with_label_values(&[PAGE_TYPE]).inc();
932            }
933            if !lookup.missing_ranges.is_empty() {
934                CACHE_MISS.with_label_values(&[PAGE_TYPE]).inc();
935            }
936            lookup
937        })
938    }
939
940    /// Puts byte fragments into the page cache.
941    pub fn put_page_ranges(
942        &self,
943        file_id: FileId,
944        row_group_idx: usize,
945        ranges: &[Range<u64>],
946        pages: &[Bytes],
947    ) {
948        if let Some(cache) = &self.page_cache {
949            cache.insert_ranges(file_id, row_group_idx, ranges, pages);
950        }
951    }
952
953    /// Evicts every puffin-related cache entry for the given file.
954    pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) {
955        if let Some(cache) = &self.bloom_filter_index_cache {
956            cache.invalidate_file(file_id.file_id());
957        }
958
959        if let Some(cache) = &self.inverted_index_cache {
960            cache.invalidate_file(file_id.file_id());
961        }
962
963        if let Some(cache) = &self.index_result_cache {
964            cache.invalidate_file(file_id.file_id());
965        }
966
967        #[cfg(feature = "vector_index")]
968        if let Some(cache) = &self.vector_index_cache {
969            cache.invalidate_file(file_id.file_id());
970        }
971
972        if let Some(cache) = &self.puffin_metadata_cache {
973            cache.remove(&file_id.to_string());
974        }
975
976        if let Some(write_cache) = &self.write_cache {
977            write_cache
978                .remove(IndexKey::new(
979                    file_id.region_id(),
980                    file_id.file_id(),
981                    FileType::Puffin(file_id.version),
982                ))
983                .await;
984        }
985    }
986
987    /// Gets result of for the selector.
988    pub fn get_selector_result(
989        &self,
990        selector_key: &SelectorResultKey,
991    ) -> Option<Arc<SelectorResultValue>> {
992        self.selector_result_cache
993            .as_ref()
994            .and_then(|selector_result_cache| selector_result_cache.get(selector_key))
995    }
996
997    /// Puts result of the selector into the cache.
998    pub fn put_selector_result(
999        &self,
1000        selector_key: SelectorResultKey,
1001        result: Arc<SelectorResultValue>,
1002    ) {
1003        if let Some(cache) = &self.selector_result_cache {
1004            CACHE_BYTES
1005                .with_label_values(&[SELECTOR_RESULT_TYPE])
1006                .add(selector_result_cache_weight(&selector_key, &result).into());
1007            cache.insert(selector_key, result);
1008        }
1009    }
1010
1011    /// Gets cached result for range scan.
1012    #[allow(dead_code)]
1013    pub(crate) fn get_range_result(
1014        &self,
1015        key: &RangeScanCacheKey,
1016    ) -> Option<Arc<RangeScanCacheValue>> {
1017        self.range_result_cache
1018            .as_ref()
1019            .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
1020    }
1021
1022    /// Puts range scan result into cache.
1023    pub(crate) fn put_range_result(
1024        &self,
1025        key: RangeScanCacheKey,
1026        result: Arc<RangeScanCacheValue>,
1027    ) {
1028        if let Some(cache) = &self.range_result_cache {
1029            CACHE_BYTES
1030                .with_label_values(&[RANGE_RESULT_TYPE])
1031                .add(range_result_cache_weight(&key, &result).into());
1032            cache.insert(key, result);
1033        }
1034    }
1035
    /// Returns true if the range result cache is enabled, i.e. the manager holds a
    /// range result cache instance.
    pub(crate) fn has_range_result_cache(&self) -> bool {
        self.range_result_cache.is_some()
    }
1040
    /// Returns the shared memory limiter for async range-result cache tasks.
    pub(crate) fn range_result_memory_limiter(&self) -> &Arc<RangeResultMemoryLimiter> {
        &self.range_result_memory_limiter
    }
1044
    /// Returns the configured capacity of the range result cache.
    ///
    /// The stored `u64` is converted with an `as` cast, so it is truncated on targets
    /// where `usize` is narrower than 64 bits.
    pub(crate) fn range_result_cache_size(&self) -> usize {
        self.range_result_cache_size as usize
    }
1048
    /// Gets the write cache, or `None` if the manager has no write cache.
    pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
        self.write_cache.as_ref()
    }
1053
    /// Gets the inverted index cache, if present.
    pub(crate) fn inverted_index_cache(&self) -> Option<&InvertedIndexCacheRef> {
        self.inverted_index_cache.as_ref()
    }
1057
    /// Gets the bloom filter index cache, if present.
    pub(crate) fn bloom_filter_index_cache(&self) -> Option<&BloomFilterIndexCacheRef> {
        self.bloom_filter_index_cache.as_ref()
    }
1061
    /// Gets the vector index cache, if present.
    ///
    /// Only compiled when the `vector_index` feature is enabled.
    #[cfg(feature = "vector_index")]
    pub(crate) fn vector_index_cache(&self) -> Option<&VectorIndexCacheRef> {
        self.vector_index_cache.as_ref()
    }
1066
    /// Gets the puffin metadata cache, if present.
    pub(crate) fn puffin_metadata_cache(&self) -> Option<&PuffinMetadataCacheRef> {
        self.puffin_metadata_cache.as_ref()
    }
1070
    /// Gets the index result cache, if present.
    pub(crate) fn index_result_cache(&self) -> Option<&IndexResultCache> {
        self.index_result_cache.as_ref()
    }
1074
1075    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
1076        self.prefilter_result_cache
1077            .as_ref()
1078            .and_then(|cache| update_hit_miss(cache.get(key), PREFILTER_RESULT_TYPE))
1079    }
1080
1081    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
1082        if let Some(cache) = &self.prefilter_result_cache {
1083            CACHE_BYTES
1084                .with_label_values(&[PREFILTER_RESULT_TYPE])
1085                .add(prefilter_result_cache_weight(&key, &result).into());
1086            cache.insert(key, result);
1087        }
1088    }
1089}
1090
1091/// Increases selector cache miss metrics.
1092pub fn selector_result_cache_miss() {
1093    CACHE_MISS.with_label_values(&[SELECTOR_RESULT_TYPE]).inc()
1094}
1095
1096/// Increases selector cache hit metrics.
1097pub fn selector_result_cache_hit() {
1098    CACHE_HIT.with_label_values(&[SELECTOR_RESULT_TYPE]).inc()
1099}
1100
/// Builder to construct a [CacheManager].
///
/// All sizes default to 0 and the write cache defaults to `None`.
#[derive(Default)]
pub struct CacheManagerBuilder {
    /// Capacity of the in-memory SST meta cache; 0 disables the cache.
    sst_meta_cache_size: u64,
    /// Capacity of the repeated-vector cache; 0 disables the cache.
    vector_cache_size: u64,
    /// Capacity of the page range cache; 0 disables the cache.
    page_cache_size: u64,
    /// Cache size for index metadata, passed to both the inverted index cache and the
    /// bloom filter index cache.
    index_metadata_size: u64,
    /// Cache size for index content, passed to both the inverted index cache and the
    /// bloom filter index cache (and to the vector index cache when that feature is on).
    index_content_size: u64,
    /// Page size for index content, passed to both the inverted index cache and the
    /// bloom filter index cache.
    index_content_page_size: u64,
    /// Capacity of the index result cache; 0 disables the cache.
    index_result_cache_size: u64,
    /// Capacity of the prefilter result cache; 0 disables the cache.
    prefilter_result_cache_size: u64,
    /// Cache size for puffin metadata.
    puffin_metadata_size: u64,
    /// Optional write cache handed to the manager as is.
    write_cache: Option<WriteCacheRef>,
    /// Capacity of the selector result cache; 0 disables the cache.
    selector_result_cache_size: u64,
    /// Capacity of the range result cache; 0 disables the cache. Also passed to the
    /// range result memory limiter.
    range_result_cache_size: u64,
}
1117
1118impl CacheManagerBuilder {
1119    /// Sets meta cache size.
1120    pub fn sst_meta_cache_size(mut self, bytes: u64) -> Self {
1121        self.sst_meta_cache_size = bytes;
1122        self
1123    }
1124
1125    /// Sets vector cache size.
1126    pub fn vector_cache_size(mut self, bytes: u64) -> Self {
1127        self.vector_cache_size = bytes;
1128        self
1129    }
1130
1131    /// Sets page cache size.
1132    pub fn page_cache_size(mut self, bytes: u64) -> Self {
1133        self.page_cache_size = bytes;
1134        self
1135    }
1136
1137    /// Sets write cache.
1138    pub fn write_cache(mut self, cache: Option<WriteCacheRef>) -> Self {
1139        self.write_cache = cache;
1140        self
1141    }
1142
1143    /// Sets cache size for index metadata.
1144    pub fn index_metadata_size(mut self, bytes: u64) -> Self {
1145        self.index_metadata_size = bytes;
1146        self
1147    }
1148
1149    /// Sets cache size for index content.
1150    pub fn index_content_size(mut self, bytes: u64) -> Self {
1151        self.index_content_size = bytes;
1152        self
1153    }
1154
1155    /// Sets page size for index content.
1156    pub fn index_content_page_size(mut self, bytes: u64) -> Self {
1157        self.index_content_page_size = bytes;
1158        self
1159    }
1160
1161    /// Sets cache size for index result.
1162    pub fn index_result_cache_size(mut self, bytes: u64) -> Self {
1163        self.index_result_cache_size = bytes;
1164        self
1165    }
1166
1167    /// Sets cache size for prefilter result.
1168    pub fn prefilter_result_cache_size(mut self, bytes: u64) -> Self {
1169        self.prefilter_result_cache_size = bytes;
1170        self
1171    }
1172
1173    /// Sets cache size for puffin metadata.
1174    pub fn puffin_metadata_size(mut self, bytes: u64) -> Self {
1175        self.puffin_metadata_size = bytes;
1176        self
1177    }
1178
1179    /// Sets selector result cache size.
1180    pub fn selector_result_cache_size(mut self, bytes: u64) -> Self {
1181        self.selector_result_cache_size = bytes;
1182        self
1183    }
1184
1185    /// Sets range result cache size.
1186    pub fn range_result_cache_size(mut self, bytes: u64) -> Self {
1187        self.range_result_cache_size = bytes;
1188        self
1189    }
1190
1191    /// Builds the [CacheManager].
1192    pub fn build(self) -> CacheManager {
1193        let sst_meta_cache = (self.sst_meta_cache_size != 0).then(|| {
1194            Cache::builder()
1195                .max_capacity(self.sst_meta_cache_size)
1196                .weigher(meta_cache_weight)
1197                .eviction_listener(|k, v, cause| {
1198                    let size = meta_cache_weight(&k, &v);
1199                    CACHE_BYTES
1200                        .with_label_values(&[SST_META_TYPE])
1201                        .sub(size.into());
1202                    CACHE_EVICTION
1203                        .with_label_values(&[SST_META_TYPE, removal_cause_str(cause)])
1204                        .inc();
1205                })
1206                .build()
1207        });
1208        let vector_cache = (self.vector_cache_size != 0).then(|| {
1209            Cache::builder()
1210                .max_capacity(self.vector_cache_size)
1211                .weigher(vector_cache_weight)
1212                .eviction_listener(|k, v, cause| {
1213                    let size = vector_cache_weight(&k, &v);
1214                    CACHE_BYTES
1215                        .with_label_values(&[VECTOR_TYPE])
1216                        .sub(size.into());
1217                    CACHE_EVICTION
1218                        .with_label_values(&[VECTOR_TYPE, removal_cause_str(cause)])
1219                        .inc();
1220                })
1221                .build()
1222        });
1223        let page_cache =
1224            (self.page_cache_size != 0).then(|| PageRangeCache::new(self.page_cache_size));
1225        let inverted_index_cache = InvertedIndexCache::new(
1226            self.index_metadata_size,
1227            self.index_content_size,
1228            self.index_content_page_size,
1229        );
1230        // TODO(ruihang): check if it's ok to reuse the same param with inverted index
1231        let bloom_filter_index_cache = BloomFilterIndexCache::new(
1232            self.index_metadata_size,
1233            self.index_content_size,
1234            self.index_content_page_size,
1235        );
1236        #[cfg(feature = "vector_index")]
1237        let vector_index_cache = (self.index_content_size != 0)
1238            .then(|| Arc::new(VectorIndexCache::new(self.index_content_size)));
1239        let index_result_cache = (self.index_result_cache_size != 0)
1240            .then(|| IndexResultCache::new(self.index_result_cache_size));
1241        let prefilter_result_cache = (self.prefilter_result_cache_size != 0)
1242            .then(|| new_prefilter_result_cache(self.prefilter_result_cache_size));
1243        let puffin_metadata_cache =
1244            PuffinMetadataCache::new(self.puffin_metadata_size, &CACHE_BYTES);
1245        let selector_result_cache = (self.selector_result_cache_size != 0).then(|| {
1246            Cache::builder()
1247                .max_capacity(self.selector_result_cache_size)
1248                .weigher(selector_result_cache_weight)
1249                .eviction_listener(|k, v, cause| {
1250                    let size = selector_result_cache_weight(&k, &v);
1251                    CACHE_BYTES
1252                        .with_label_values(&[SELECTOR_RESULT_TYPE])
1253                        .sub(size.into());
1254                    CACHE_EVICTION
1255                        .with_label_values(&[SELECTOR_RESULT_TYPE, removal_cause_str(cause)])
1256                        .inc();
1257                })
1258                .build()
1259        });
1260        let range_result_cache = (self.range_result_cache_size != 0).then(|| {
1261            Cache::builder()
1262                .max_capacity(self.range_result_cache_size)
1263                .weigher(range_result_cache_weight)
1264                .eviction_listener(move |k, v, cause| {
1265                    let size = range_result_cache_weight(&k, &v);
1266                    CACHE_BYTES
1267                        .with_label_values(&[RANGE_RESULT_TYPE])
1268                        .sub(size.into());
1269                    CACHE_EVICTION
1270                        .with_label_values(&[RANGE_RESULT_TYPE, removal_cause_str(cause)])
1271                        .inc();
1272                })
1273                .build()
1274        });
1275        CacheManager {
1276            sst_meta_cache,
1277            vector_cache,
1278            page_cache,
1279            write_cache: self.write_cache,
1280            inverted_index_cache: Some(Arc::new(inverted_index_cache)),
1281            bloom_filter_index_cache: Some(Arc::new(bloom_filter_index_cache)),
1282            #[cfg(feature = "vector_index")]
1283            vector_index_cache,
1284            puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)),
1285            selector_result_cache,
1286            range_result_cache,
1287            range_result_cache_size: self.range_result_cache_size,
1288            range_result_memory_limiter: Arc::new(RangeResultMemoryLimiter::new(
1289                self.range_result_cache_size as usize,
1290                RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize,
1291            )),
1292            index_result_cache,
1293            prefilter_result_cache,
1294        }
1295    }
1296}
1297
1298fn meta_cache_weight(k: &SstMetaKey, v: &Arc<CachedSstMeta>) -> u32 {
1299    // We ignore the size of `Arc`.
1300    (k.estimated_size() + parquet_meta_size(&v.parquet_metadata) + v.region_metadata_weight) as u32
1301}
1302
1303fn vector_cache_weight(_k: &(ConcreteDataType, Value), v: &VectorRef) -> u32 {
1304    // We ignore the heap size of `Value`.
1305    (mem::size_of::<ConcreteDataType>() + mem::size_of::<Value>() + v.memory_size()) as u32
1306}
1307
1308fn page_cache_weight(k: &PageFragmentKey, v: &Bytes) -> u32 {
1309    (k.estimated_size() + mem::size_of::<Bytes>() + v.len()) as u32
1310}
1311
1312fn selector_result_cache_weight(k: &SelectorResultKey, v: &Arc<SelectorResultValue>) -> u32 {
1313    (mem::size_of_val(k) + v.estimated_size()) as u32
1314}
1315
1316fn range_result_cache_weight(k: &RangeScanCacheKey, v: &Arc<RangeScanCacheValue>) -> u32 {
1317    (k.estimated_size() + v.estimated_size()) as u32
1318}
1319
1320/// Updates cache hit/miss metrics.
1321fn update_hit_miss<T>(value: Option<T>, cache_type: &str) -> Option<T> {
1322    if value.is_some() {
1323        CACHE_HIT.with_label_values(&[cache_type]).inc();
1324    } else {
1325        CACHE_MISS.with_label_values(&[cache_type]).inc();
1326    }
1327    value
1328}
1329
/// Cache key for SST meta: the (region id, file id) pair of the SST file.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SstMetaKey(RegionId, FileId);
1333
1334impl SstMetaKey {
1335    /// Returns memory used by the key (estimated).
1336    fn estimated_size(&self) -> usize {
1337        mem::size_of::<Self>()
1338    }
1339}
1340
/// Identifies one row group of an SST file; used to group the cached fragments of
/// that row group in the fragment index.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct PageFragmentGroupKey {
    /// Id of the SST file.
    file_id: FileId,
    /// Index of the row group.
    row_group_idx: usize,
}
1346
/// Cache key for one byte fragment in an SST row group.
///
/// The fragment covers the half-open offset range `start..end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct PageFragmentKey {
    /// Id of the SST file.
    file_id: FileId,
    /// Index of the row group.
    row_group_idx: usize,
    /// Start offset of the cached byte fragment.
    start: u64,
    /// End offset of the cached byte fragment.
    end: u64,
}
1359
1360impl PageFragmentKey {
1361    fn new(file_id: FileId, row_group_idx: usize, range: &Range<u64>) -> PageFragmentKey {
1362        PageFragmentKey {
1363            file_id,
1364            row_group_idx,
1365            start: range.start,
1366            end: range.end,
1367        }
1368    }
1369
1370    fn group_key(&self) -> PageFragmentGroupKey {
1371        PageFragmentGroupKey {
1372            file_id: self.file_id,
1373            row_group_idx: self.row_group_idx,
1374        }
1375    }
1376
1377    /// Returns memory used by the key (estimated).
1378    fn estimated_size(&self) -> usize {
1379        mem::size_of::<Self>()
1380    }
1381}
1382
/// One cached byte fragment that overlaps a requested range.
///
/// The lookup clips each part to the requested range before returning it.
#[derive(Clone)]
pub struct PageRangePart {
    /// Range covered by `bytes`.
    pub range: Range<u64>,
    /// Bytes for `range`.
    pub bytes: Bytes,
}
1391
/// Result of looking up request ranges in the page range cache.
pub struct PageRangeLookup {
    /// Cached fragments grouped by the original requested range index.
    ///
    /// Holds one entry per requested range, in request order; the entry is empty when
    /// nothing was cached for that range or the range itself is empty.
    pub cached_parts: Vec<Vec<PageRangePart>>,
    /// Ranges that are not covered by cached fragments and need fetching.
    pub missing_ranges: Vec<Range<u64>>,
    /// Number of cached fragments used.
    pub cached_range_count: usize,
    /// Number of requested bytes served from cached fragments.
    pub cached_bytes: u64,
}
1403
impl PageRangeLookup {
    /// Returns true if no requested range is missing, i.e. nothing needs fetching.
    pub fn is_fully_cached(&self) -> bool {
        self.missing_ranges.is_empty()
    }
}
1409
/// Fragments of one row group, ordered by their `(start, end)` offsets.
type PageFragmentRangeIndex = BTreeMap<(u64, u64), PageFragmentKey>;
/// Maps each row group to the ordered index of its fragments.
type PageFragmentIndex = HashMap<PageFragmentGroupKey, PageFragmentRangeIndex>;
1412
/// Byte-fragment cache for Parquet row-group reads.
pub struct PageRangeCache {
    /// Stores the fragment bytes, keyed by fragment.
    cache: Cache<PageFragmentKey, Bytes>,
    /// Ordered index of fragment keys per row group, used to find fragments that
    /// overlap a requested range.
    index: RwLock<PageFragmentIndex>,
}
1418
1419impl PageRangeCache {
    /// Creates a page range cache with the given weighted `capacity`.
    ///
    /// The cache is built inside [Arc::new_cyclic] so that the eviction listener can hold
    /// a weak reference back to the [PageRangeCache] and drop index entries of evicted
    /// fragments.
    fn new(capacity: u64) -> Arc<PageRangeCache> {
        Arc::new_cyclic(|weak_cache: &std::sync::Weak<PageRangeCache>| {
            let cache = Cache::builder()
                .max_capacity(capacity)
                .weigher(page_cache_weight)
                .eviction_listener({
                    let weak_cache = weak_cache.clone();
                    move |k, v, cause| {
                        // Undo the accounting done on insert and count the eviction.
                        let size = page_cache_weight(&k, &v);
                        CACHE_BYTES.with_label_values(&[PAGE_TYPE]).sub(size.into());
                        CACHE_EVICTION
                            .with_label_values(&[PAGE_TYPE, removal_cause_str(cause)])
                            .inc();

                        // A replaced entry is still cached under the same key, so its
                        // index entry is kept. The upgrade fails if the cache is gone.
                        if let Some(cache) = weak_cache.upgrade()
                            && !matches!(cause, RemovalCause::Replaced)
                        {
                            cache.remove_index_entry(*k);
                        }
                    }
                })
                .build();

            PageRangeCache {
                cache,
                index: RwLock::new(HashMap::new()),
            }
        })
    }
1449
    /// Resolves each requested range against the cached fragments of a row group.
    ///
    /// For every range in `ranges` the result holds the cached parts that cover it, in
    /// ascending offset order and without overlap, and the gaps are appended to
    /// `missing_ranges`. Empty ranges (`start >= end`) yield an empty part list and no
    /// missing range. Index entries whose bytes are no longer cached are cleaned up.
    fn lookup(
        &self,
        file_id: FileId,
        row_group_idx: usize,
        ranges: &[Range<u64>],
    ) -> PageRangeLookup {
        let mut cached_parts = Vec::with_capacity(ranges.len());
        let mut missing_ranges = Vec::new();
        let mut cached_range_count = 0;
        let mut cached_bytes = 0;

        for range in ranges {
            // Keep `cached_parts` aligned with `ranges` even for empty requests.
            if range.start >= range.end {
                cached_parts.push(Vec::new());
                continue;
            }

            let mut parts = Vec::new();
            let candidates = self.find_index_candidates(file_id, row_group_idx, range);
            let mut stale_keys = Vec::new();

            for fragment_key in candidates {
                if let Some(bytes) = self.cache.get(&fragment_key) {
                    // Clip the fragment to the requested range.
                    let part_start = range.start.max(fragment_key.start);
                    let part_end = range.end.min(fragment_key.end);
                    // Offsets into `bytes` are relative to the fragment start.
                    let slice_start = (part_start - fragment_key.start) as usize;
                    let slice_end = (part_end - fragment_key.start) as usize;
                    parts.push(PageRangePart {
                        range: part_start..part_end,
                        bytes: bytes.slice(slice_start..slice_end),
                    });
                } else {
                    // Indexed but no longer in the cache.
                    stale_keys.push(fragment_key);
                }
            }
            for key in stale_keys {
                self.remove_uncached_index_entry(key);
            }

            // Sweep the parts in index order; `cursor` is the first offset of the
            // request that is not yet covered or reported as missing.
            let mut cursor = range.start;
            let mut compacted_parts: Vec<PageRangePart> = Vec::with_capacity(parts.len());
            for part in parts {
                // Already fully covered by earlier parts.
                if part.range.end <= cursor {
                    continue;
                }

                // Trim the prefix that overlaps what is already covered.
                let part = if part.range.start < cursor {
                    let offset = (cursor - part.range.start) as usize;
                    PageRangePart {
                        range: cursor..part.range.end,
                        bytes: part.bytes.slice(offset..),
                    }
                } else {
                    part
                };

                // A gap before this part has to be fetched.
                if cursor < part.range.start {
                    missing_ranges.push(cursor..part.range.start);
                }
                cached_bytes += part.range.end - part.range.start;
                cached_range_count += 1;
                cursor = part.range.end;
                compacted_parts.push(part);

                if cursor >= range.end {
                    break;
                }
            }

            // The tail after the last usable part is missing as well.
            if cursor < range.end {
                missing_ranges.push(cursor..range.end);
            }
            cached_parts.push(compacted_parts);
        }

        PageRangeLookup {
            cached_parts,
            missing_ranges,
            cached_range_count,
            cached_bytes,
        }
    }
1532
1533    fn insert_ranges(
1534        &self,
1535        file_id: FileId,
1536        row_group_idx: usize,
1537        ranges: &[Range<u64>],
1538        pages: &[Bytes],
1539    ) {
1540        for (range, bytes) in ranges.iter().zip(pages) {
1541            if range.start >= range.end || bytes.len() as u64 != range.end - range.start {
1542                continue;
1543            }
1544
1545            let key = PageFragmentKey::new(file_id, row_group_idx, range);
1546            let bytes = Bytes::copy_from_slice(bytes.as_ref());
1547            let size = page_cache_weight(&key, &bytes);
1548            CACHE_BYTES.with_label_values(&[PAGE_TYPE]).add(size.into());
1549            self.cache.insert(key, bytes);
1550            let mut index = self.index.write().unwrap();
1551            index
1552                .entry(key.group_key())
1553                .or_default()
1554                .insert((key.start, key.end), key);
1555        }
1556    }
1557
1558    fn find_index_candidates(
1559        &self,
1560        file_id: FileId,
1561        row_group_idx: usize,
1562        range: &Range<u64>,
1563    ) -> Vec<PageFragmentKey> {
1564        let group_key = PageFragmentGroupKey {
1565            file_id,
1566            row_group_idx,
1567        };
1568        let index = self.index.read().unwrap();
1569        index
1570            .get(&group_key)
1571            .map(|ranges| {
1572                ranges
1573                    .range(..(range.end, 0))
1574                    .filter_map(|(_, fragment_key)| {
1575                        (fragment_key.end > range.start).then_some(*fragment_key)
1576                    })
1577                    .collect()
1578            })
1579            .unwrap_or_default()
1580    }
1581
1582    fn remove_uncached_index_entry(&self, key: PageFragmentKey) {
1583        let group_key = key.group_key();
1584        let mut index = self.index.write().unwrap();
1585        if self.cache.contains_key(&key) {
1586            return;
1587        }
1588
1589        Self::remove_index_entry_locked(&mut index, group_key, key);
1590    }
1591
1592    fn remove_index_entry(&self, key: PageFragmentKey) {
1593        let group_key = key.group_key();
1594        let mut index = self.index.write().unwrap();
1595        Self::remove_index_entry_locked(&mut index, group_key, key);
1596    }
1597
1598    fn remove_index_entry_locked(
1599        index: &mut PageFragmentIndex,
1600        group_key: PageFragmentGroupKey,
1601        key: PageFragmentKey,
1602    ) {
1603        let Some(ranges) = index.get_mut(&group_key) else {
1604            return;
1605        };
1606
1607        let removed = ranges
1608            .get(&(key.start, key.end))
1609            .is_some_and(|current| current == &key);
1610        if removed {
1611            ranges.remove(&(key.start, key.end));
1612        }
1613        if ranges.is_empty() {
1614            index.remove(&group_key);
1615        }
1616    }
1617}
1618
/// Cache key for time series row selector result.
///
/// A result is identified by the SST file, the row group and the selector applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SelectorResultKey {
    /// Id of the SST file.
    pub file_id: FileId,
    /// Index of the row group.
    pub row_group_idx: usize,
    /// Time series row selector.
    pub selector: TimeSeriesRowSelector,
}
1629
/// Result stored in the selector result cache, in one of two batch formats.
pub enum SelectorResult {
    /// Batches in the primary key format.
    PrimaryKey(Vec<Batch>),
    /// Record batches in the flat format.
    Flat(Vec<RecordBatch>),
}
1637
/// Cached result for time series row selector.
pub struct SelectorResultValue {
    /// Batches of rows selected by the selector.
    pub result: SelectorResult,
    /// The read columns of rows.
    pub read_cols: ParquetReadColumns,
}
1645
1646impl SelectorResultValue {
1647    /// Creates a new selector result value with primary key format.
1648    pub fn new(result: Vec<Batch>, read_cols: ParquetReadColumns) -> SelectorResultValue {
1649        SelectorResultValue {
1650            result: SelectorResult::PrimaryKey(result),
1651            read_cols,
1652        }
1653    }
1654
1655    /// Creates a new selector result value with flat format.
1656    pub fn new_flat(
1657        result: Vec<RecordBatch>,
1658        read_cols: ParquetReadColumns,
1659    ) -> SelectorResultValue {
1660        SelectorResultValue {
1661            result: SelectorResult::Flat(result),
1662            read_cols,
1663        }
1664    }
1665
1666    /// Returns memory used by the value (estimated).
1667    fn estimated_size(&self) -> usize {
1668        match &self.result {
1669            SelectorResult::PrimaryKey(batches) => {
1670                batches.iter().map(|batch| batch.memory_size()).sum()
1671            }
1672            SelectorResult::Flat(batches) => batches.iter().map(record_batch_estimated_size).sum(),
1673        }
1674    }
1675}
1676
// The aliases below are all instances of `moka::sync::Cache`, which this
// file imports as `Cache`.

/// Maps (region id, file id) to fused SST metadata.
type SstMetaCache = Cache<SstMetaKey, Arc<CachedSstMeta>>;
/// Maps [Value] to a vector that holds this value repeatedly.
///
/// The key also carries the [ConcreteDataType] of the value.
///
/// e.g. `"hello" => ["hello", "hello", "hello"]`
type VectorCache = Cache<(ConcreteDataType, Value), VectorRef>;
/// Maps (file id, row group id, time series row selector) to [SelectorResultValue].
type SelectorResultCache = Cache<SelectorResultKey, Arc<SelectorResultValue>>;
/// Maps partition-range scan key to cached flat batches.
type RangeResultCache = Cache<RangeScanCacheKey, Arc<RangeScanCacheValue>>;
1687
1688#[cfg(test)]
1689mod tests {
1690    use std::sync::Arc;
1691
1692    use api::v1::SemanticType;
1693    use api::v1::index::{BloomFilterMeta, InvertedIndexMetas};
1694    use datatypes::schema::ColumnSchema;
1695    use datatypes::vectors::Int64Vector;
1696    use puffin::file_metadata::FileMetadata;
1697    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
1698    use store_api::storage::ColumnId;
1699
1700    use super::*;
1701    use crate::cache::index::bloom_filter_index::Tag;
1702    use crate::cache::index::result_cache::PredicateKey;
1703    use crate::cache::test_util::{
1704        parquet_meta, sst_parquet_meta, sst_parquet_meta_with_region_metadata,
1705    };
1706    use crate::read::range_cache::{
1707        RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder,
1708    };
1709    use crate::read::read_columns::ReadColumns;
1710    use crate::sst::parquet::row_selection::RowGroupSelection;
1711
1712    #[tokio::test]
1713    async fn test_disable_cache() {
1714        let cache = CacheManager::default();
1715        assert!(cache.sst_meta_cache.is_none());
1716        assert!(cache.vector_cache.is_none());
1717        assert!(cache.page_cache.is_none());
1718
1719        let region_id = RegionId::new(1, 1);
1720        let file_id = RegionFileId::new(region_id, FileId::random());
1721        let metadata = parquet_meta();
1722        let mut metrics = MetadataCacheMetrics::default();
1723        cache.put_parquet_meta_data(file_id, metadata, None);
1724        assert!(
1725            cache
1726                .get_parquet_meta_data(file_id, &mut metrics, Default::default())
1727                .await
1728                .is_none()
1729        );
1730
1731        let value = Value::Int64(10);
1732        let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
1733        cache.put_repeated_vector(value.clone(), vector.clone());
1734        assert!(
1735            cache
1736                .get_repeated_vector(&ConcreteDataType::int64_datatype(), &value)
1737                .is_none()
1738        );
1739
1740        cache.put_page_ranges(
1741            file_id.file_id(),
1742            1,
1743            &[Range { start: 0, end: 5 }],
1744            &[Bytes::from_static(b"abcde")],
1745        );
1746        assert!(
1747            cache
1748                .get_page_ranges(file_id.file_id(), 1, &[Range { start: 0, end: 5 }])
1749                .is_none()
1750        );
1751
1752        assert!(cache.write_cache().is_none());
1753    }
1754
1755    #[tokio::test]
1756    async fn test_parquet_meta_cache() {
1757        let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
1758        let mut metrics = MetadataCacheMetrics::default();
1759        let region_id = RegionId::new(1, 1);
1760        let file_id = RegionFileId::new(region_id, FileId::random());
1761        assert!(
1762            cache
1763                .get_parquet_meta_data(file_id, &mut metrics, Default::default())
1764                .await
1765                .is_none()
1766        );
1767        let (metadata, region_metadata) = sst_parquet_meta();
1768        cache.put_parquet_meta_data(file_id, metadata, None);
1769        let cached = cache
1770            .get_sst_meta_data(file_id, &mut metrics, Default::default())
1771            .await
1772            .unwrap();
1773        assert_eq!(region_metadata, cached.region_metadata());
1774        assert!(
1775            cached
1776                .parquet_metadata()
1777                .file_metadata()
1778                .key_value_metadata()
1779                .is_none_or(|key_values| {
1780                    key_values
1781                        .iter()
1782                        .all(|key_value| key_value.key != PARQUET_METADATA_KEY)
1783                })
1784        );
1785        cache.remove_parquet_meta_data(file_id);
1786        assert!(
1787            cache
1788                .get_parquet_meta_data(file_id, &mut metrics, Default::default())
1789                .await
1790                .is_none()
1791        );
1792    }
1793
1794    #[tokio::test]
1795    async fn test_parquet_meta_cache_with_provided_region_metadata() {
1796        let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
1797        let mut metrics = MetadataCacheMetrics::default();
1798        let region_id = RegionId::new(1, 1);
1799        let file_id = RegionFileId::new(region_id, FileId::random());
1800        let (metadata, region_metadata) = sst_parquet_meta();
1801
1802        cache.put_parquet_meta_data(file_id, metadata, Some(region_metadata.clone()));
1803
1804        let cached = cache
1805            .get_sst_meta_data(file_id, &mut metrics, Default::default())
1806            .await
1807            .unwrap();
1808        assert!(Arc::ptr_eq(&region_metadata, &cached.region_metadata()));
1809    }
1810
1811    #[tokio::test]
1812    async fn test_parquet_meta_cache_respects_page_index_policy() {
1813        let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
1814        let region_id = RegionId::new(1, 1);
1815        let file_id = RegionFileId::new(region_id, FileId::random());
1816        let (metadata, _) = sst_parquet_meta();
1817
1818        let skip_metadata = Arc::new(
1819            CachedSstMeta::try_new_with_page_index_policy(
1820                "test.parquet",
1821                Arc::unwrap_or_clone(metadata.clone()),
1822                None,
1823                PageIndexPolicy::Skip,
1824            )
1825            .unwrap(),
1826        );
1827        cache.put_sst_meta_data(file_id, skip_metadata);
1828
1829        let mut metrics = MetadataCacheMetrics::default();
1830        assert!(
1831            cache
1832                .get_sst_meta_data(file_id, &mut metrics, PageIndexPolicy::Optional)
1833                .await
1834                .is_none()
1835        );
1836        assert_eq!(1, metrics.cache_miss);
1837
1838        let optional_metadata = Arc::new(
1839            CachedSstMeta::try_new_with_page_index_policy(
1840                "test.parquet",
1841                Arc::unwrap_or_clone(metadata),
1842                None,
1843                PageIndexPolicy::Optional,
1844            )
1845            .unwrap(),
1846        );
1847        cache.put_sst_meta_data(file_id, optional_metadata);
1848
1849        let mut metrics = MetadataCacheMetrics::default();
1850        assert!(
1851            cache
1852                .get_sst_meta_data(file_id, &mut metrics, PageIndexPolicy::Optional)
1853                .await
1854                .is_some()
1855        );
1856        assert_eq!(1, metrics.mem_cache_hit);
1857
1858        let mut metrics = MetadataCacheMetrics::default();
1859        assert!(
1860            cache
1861                .get_sst_meta_data(file_id, &mut metrics, PageIndexPolicy::Skip)
1862                .await
1863                .is_some()
1864        );
1865        assert_eq!(1, metrics.mem_cache_hit);
1866    }
1867
/// Checks that the weight of a cached SST meta entry is the sum of the key
/// size, the parquet metadata size and the region metadata weight, and that
/// the region metadata weight exceeds the length of its JSON form.
#[test]
fn test_meta_cache_weight_accounts_for_decoded_region_metadata() {
    let region_metadata = Arc::new(wide_region_metadata(128));
    let json_len = region_metadata.to_json().unwrap().len();

    let parquet = sst_parquet_meta_with_region_metadata(region_metadata.clone());
    let entry = CachedSstMeta::try_new("test.parquet", Arc::unwrap_or_clone(parquet)).unwrap();
    let entry = Arc::new(entry);
    let key = SstMetaKey(region_metadata.region_id, FileId::random());

    assert!(entry.region_metadata_weight > json_len);

    let expected_weight = key.estimated_size()
        + parquet_meta_size(&entry.parquet_metadata)
        + entry.region_metadata_weight;
    assert_eq!(meta_cache_weight(&key, &entry) as usize, expected_weight);
}
1886
/// Checks that a repeated vector is a miss before it is put and is returned
/// unchanged afterwards.
#[test]
fn test_repeated_vector_cache() {
    let manager = CacheManager::builder().vector_cache_size(4096).build();
    let data_type = ConcreteDataType::int64_datatype();
    let value = Value::Int64(10);

    let hit_before_put = manager.get_repeated_vector(&data_type, &value).is_some();
    assert!(!hit_before_put);

    let repeated: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
    manager.put_repeated_vector(value.clone(), repeated.clone());

    let cached = manager.get_repeated_vector(&data_type, &value).unwrap();
    assert_eq!(repeated, cached);
}
1903
/// Exercises page range lookups through `CacheManager`: a cold miss, a
/// request fully inside a cached range, and a request that only partly
/// overlaps it.
#[test]
fn test_page_cache() {
    let manager = CacheManager::builder().page_cache_size(1000).build();
    let file_id = FileId::random();

    // Nothing cached yet: the whole request is reported as missing.
    let cold = manager.get_page_ranges(file_id, 0, &[0..10]).unwrap();
    assert_eq!(vec![0..10], cold.missing_ranges);

    // Cache 400 bytes for the range 100..500.
    manager.put_page_ranges(file_id, 0, &[100..500], &[Bytes::from(vec![7; 400])]);

    // A request inside the cached range is served entirely from the cache.
    let inner = manager.get_page_ranges(file_id, 0, &[200..300]).unwrap();
    assert!(inner.is_fully_cached());
    assert_eq!(100, inner.cached_bytes);
    assert_eq!(1, inner.cached_parts.len());
    let inner_part = &inner.cached_parts[0][0];
    assert_eq!(200..300, inner_part.range);
    assert_eq!(100, inner_part.bytes.len());

    // A request running past the cached range is only partly served.
    let straddling = manager.get_page_ranges(file_id, 0, &[400..600]).unwrap();
    assert!(!straddling.is_fully_cached());
    assert_eq!(100, straddling.cached_bytes);
    assert_eq!(vec![500..600], straddling.missing_ranges);
    assert_eq!(400..500, straddling.cached_parts[0][0].range);
}
1944
/// Checks that bytes returned from the cache have the same content as the
/// inserted page but start at a different address than the inserted slice.
#[test]
fn test_page_cache_detaches_fragment_bytes() {
    let cache = PageRangeCache::new(1000);
    let file_id = FileId::random();
    let range = 0..10;

    // A 10-byte view into a larger buffer.
    let backing = Bytes::from(vec![1; 1024]);
    let inserted = backing.slice(512..522);
    let inserted_ptr = inserted.as_ptr();

    cache.insert_ranges(
        file_id,
        0,
        std::slice::from_ref(&range),
        std::slice::from_ref(&inserted),
    );

    let lookup = cache.lookup(file_id, 0, std::slice::from_ref(&range));
    assert!(lookup.is_fully_cached());
    assert_eq!(1, lookup.cached_parts[0].len());

    let cached_bytes = &lookup.cached_parts[0][0].bytes;
    assert_eq!(&inserted[..], &cached_bytes[..]);
    assert_ne!(inserted_ptr, cached_bytes.as_ptr());
}
1967
/// Checks that inserting the same range twice leaves a single index
/// candidate and that lookups return the bytes of the second insert.
#[test]
fn test_page_cache_replaces_fragment() {
    let cache = PageRangeCache::new(1000);
    let file_id = FileId::random();
    let range = 0..10;

    // Insert the same range twice, first filled with 1s and then with 2s.
    for fill in [1u8, 2u8] {
        cache.insert_ranges(
            file_id,
            0,
            std::slice::from_ref(&range),
            &[Bytes::from(vec![fill; 10])],
        );
    }
    cache.cache.run_pending_tasks();

    let expected_candidates = vec![PageFragmentKey::new(file_id, 0, &range)];
    assert_eq!(
        expected_candidates,
        cache.find_index_candidates(file_id, 0, &range)
    );

    let lookup = cache.lookup(file_id, 0, std::slice::from_ref(&range));
    assert!(lookup.is_fully_cached());
    assert_eq!(&[2u8; 10][..], &lookup.cached_parts[0][0].bytes[..]);
}
1996
/// Checks that two separate inserts of disjoint ranges for the same row
/// group are both served by a single lookup.
#[test]
fn test_page_cache_retains_disjoint_inserts_for_same_row_group() {
    let cache = PageRangeCache::new(1000);
    let file_id = FileId::random();
    let first = 0..10;
    let second = 20..30;

    // One insert call per range, each with its own fill byte.
    for (range, fill) in [(&first, 1u8), (&second, 2u8)] {
        cache.insert_ranges(
            file_id,
            0,
            std::slice::from_ref(range),
            &[Bytes::from(vec![fill; 10])],
        );
    }

    let lookup = cache.lookup(file_id, 0, &[first, second]);
    assert!(lookup.is_fully_cached());
    assert_eq!(2, lookup.cached_range_count);
    assert_eq!(&[1u8; 10][..], &lookup.cached_parts[0][0].bytes[..]);
    assert_eq!(&[2u8; 10][..], &lookup.cached_parts[1][0].bytes[..]);
}
2023
/// Checks that invalidating a fragment in the underlying cache also clears
/// its index entry, so a later lookup reports the range as missing.
#[test]
fn test_page_cache_fragment_eviction() {
    let file_id = FileId::random();
    let range = 0..10;
    let key = PageFragmentKey::new(file_id, 0, &range);

    // Size the cache to the weight of exactly one 10-byte fragment.
    let capacity = page_cache_weight(&key, &Bytes::from(vec![1; 10])) as u64;
    let cache = PageRangeCache::new(capacity);

    cache.insert_ranges(
        file_id,
        0,
        std::slice::from_ref(&range),
        &[Bytes::from(vec![1; 10])],
    );
    let cached_after_insert = cache
        .lookup(file_id, 0, std::slice::from_ref(&range))
        .is_fully_cached();
    assert!(cached_after_insert);

    cache.cache.invalidate(&key);
    cache.cache.run_pending_tasks();
    assert!(cache.find_index_candidates(file_id, 0, &range).is_empty());

    let after_invalidate = cache.lookup(file_id, 0, std::slice::from_ref(&range));
    assert!(!after_invalidate.is_fully_cached());
    assert_eq!(vec![0..10], after_invalidate.missing_ranges);
}
2052
/// Checks that a 10-byte fragment inserted into a cache created with
/// capacity 1 is not served by a later lookup.
#[test]
fn test_page_cache_rejects_oversized_fragment() {
    let tiny_cache = PageRangeCache::new(1);
    let file_id = FileId::random();
    let range = 0..10;
    let fragment = Bytes::from(vec![1; 10]);

    tiny_cache.insert_ranges(file_id, 0, std::slice::from_ref(&range), &[fragment]);
    tiny_cache.cache.run_pending_tasks();

    let lookup = tiny_cache.lookup(file_id, 0, std::slice::from_ref(&range));
    assert!(!lookup.is_fully_cached());
    assert_eq!(vec![0..10], lookup.missing_ranges);
}
2071
/// Checks that a selector result is a miss before it is put and a hit
/// afterwards.
#[test]
fn test_selector_result_cache() {
    let manager = CacheManager::builder()
        .selector_result_cache_size(1000)
        .build();
    let key = SelectorResultKey {
        file_id: FileId::random(),
        row_group_idx: 0,
        selector: TimeSeriesRowSelector::LastRow,
    };
    assert!(manager.get_selector_result(&key).is_none());

    // An empty primary-key result with no read columns is enough here.
    let read_cols = ParquetReadColumns::from_deduped(Vec::new());
    let empty_result = SelectorResultValue::new(Vec::new(), read_cols);
    manager.put_selector_result(key, Arc::new(empty_result));
    assert!(manager.get_selector_result(&key).is_some());
}
2091
/// Exercises the prefilter result cache directly on `CacheManager` and
/// through each `CacheStrategy` variant.
///
/// Reads through `Compaction` and `Disabled` return `None` even though the
/// shared manager holds the entry.
#[test]
fn test_prefilter_result_cache() {
    let key = PrefilterKey::new(
        FileId::random(),
        0,
        None,
        1,
        SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()]),
    );
    let selection = Arc::new(BooleanBuffer::new_set(3));

    // A manager built without a prefilter cache size keeps nothing.
    let unconfigured = CacheManager::builder().build();
    unconfigured.put_prefilter_result(key.clone(), selection.clone());
    assert!(unconfigured.get_prefilter_result(&key).is_none());

    let manager = Arc::new(
        CacheManager::builder()
            .prefilter_result_cache_size(1000)
            .build(),
    );
    let cached_in_manager = || manager.get_prefilter_result(&key).is_some();

    assert!(!cached_in_manager());
    manager.put_prefilter_result(key.clone(), selection.clone());
    let cached = manager.get_prefilter_result(&key).unwrap();
    assert_eq!(cached.as_ref(), selection.as_ref());

    // `EnableAll` reads through to the manager.
    let enable_all = CacheStrategy::EnableAll(manager.clone());
    assert!(enable_all.get_prefilter_result(&key).is_some());

    // `Compaction` does not return prefilter results.
    let compaction = CacheStrategy::Compaction(manager.clone());
    assert!(compaction.get_prefilter_result(&key).is_none());
    compaction.put_prefilter_result(key.clone(), selection.clone());
    assert!(cached_in_manager());

    // `Disabled` does not return prefilter results either.
    let disabled = CacheStrategy::Disabled;
    assert!(disabled.get_prefilter_result(&key).is_none());
    disabled.put_prefilter_result(key.clone(), selection);
    assert!(cached_in_manager());
}
2133
/// Checks that changing any single argument of `PrefilterKey::new` (file id,
/// second argument, row selection snapshot, fourth argument, or the filter
/// strings) produces a key that differs from the base key.
#[test]
fn test_prefilter_key_distinguishes_dimensions() {
    let file_id = FileId::random();

    let raw_selection = RowSelection::from(vec![RowSelector::skip(1), RowSelector::select(3)]);
    let raw_other_selection =
        RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(2)]);
    let selection = PrefilterKey::row_selection_snapshot(Some(&raw_selection));
    let other_selection = PrefilterKey::row_selection_snapshot(Some(&raw_other_selection));

    // Builds a key from plain string filters; arguments are forwarded to
    // `PrefilterKey::new` in the same order.
    let key_for = |file, second, snapshot, fourth, filters: &[&str]| {
        PrefilterKey::new(
            file,
            second,
            snapshot,
            fourth,
            SmallVec::from_vec(filters.iter().copied().map(String::from).collect()),
        )
    };

    let base = key_for(file_id, 0, selection.clone(), 1, &["tag_0 IN ([a])"]);

    // Each variant differs from `base` in exactly one argument.
    let variants = [
        // Different file id.
        key_for(
            FileId::random(),
            0,
            selection.clone(),
            1,
            &["tag_0 IN ([a])"],
        ),
        // Different second argument.
        key_for(file_id, 1, selection.clone(), 1, &["tag_0 IN ([a])"]),
        // Different row selection snapshot.
        key_for(file_id, 0, other_selection, 1, &["tag_0 IN ([a])"]),
        // Different filter string.
        key_for(file_id, 0, selection.clone(), 1, &["tag_0 IN ([b])"]),
        // Different fourth argument.
        key_for(file_id, 0, selection.clone(), 2, &["tag_0 IN ([a])"]),
        // Additional filter string.
        key_for(
            file_id,
            0,
            selection,
            1,
            &["tag_0 IN ([a])", "tag_1 IN ([x])"],
        ),
    ];

    for variant in variants {
        assert_ne!(base, variant);
    }
}
2212
/// Exercises the range result cache directly on `CacheManager` and through
/// each `CacheStrategy` variant.
///
/// Reads through `Compaction` and `Disabled` return `None` even though the
/// shared manager holds the entry.
#[test]
fn test_range_result_cache() {
    let manager = Arc::new(
        CacheManager::builder()
            .range_result_cache_size(1024 * 1024)
            .build(),
    );

    let scan = ScanRequestFingerprintBuilder {
        read_columns: ReadColumns::from_deduped_column_ids(std::iter::empty()),
        read_column_types: vec![],
        filters: vec!["tag_0 = 1".to_string()],
        time_filters: vec![],
        series_row_selector: None,
        append_mode: false,
        filter_deleted: true,
        merge_mode: crate::region::options::MergeMode::LastRow,
        partition_expr_version: 0,
    }
    .build();
    let key = RangeScanCacheKey {
        region_id: RegionId::new(1, 1),
        row_groups: vec![(FileId::random(), 0)],
        scan,
    };
    let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0));
    let cached_in_manager = || manager.get_range_result(&key).is_some();

    assert!(!cached_in_manager());
    manager.put_range_result(key.clone(), value.clone());
    assert!(cached_in_manager());

    // `EnableAll` reads through to the manager.
    let enable_all = CacheStrategy::EnableAll(manager.clone());
    assert!(enable_all.get_range_result(&key).is_some());

    // `Compaction` does not return range results.
    let compaction = CacheStrategy::Compaction(manager.clone());
    assert!(compaction.get_range_result(&key).is_none());
    compaction.put_range_result(key.clone(), value.clone());
    assert!(cached_in_manager());

    // `Disabled` does not return range results either.
    let disabled = CacheStrategy::Disabled;
    assert!(disabled.get_range_result(&key).is_none());
    disabled.put_range_result(key.clone(), value);
    assert!(cached_in_manager());
}
2256
/// Checks that the configured range result cache size is reported back and
/// that the memory limiter gets `ceil(size / permit_bytes)` permits of
/// `RANGE_RESULT_CONCAT_MEMORY_PERMIT` bytes each.
#[test]
fn test_range_result_cache_size_configures_limiter() {
    let cache_size = 3 * 1024_u64;
    let manager = CacheManager::builder()
        .range_result_cache_size(cache_size)
        .build();

    let size_in_bytes = cache_size as usize;
    let permit_bytes = RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize;

    assert_eq!(manager.range_result_cache_size(), size_in_bytes);

    let limiter = manager.range_result_memory_limiter();
    assert_eq!(limiter.permit_bytes(), permit_bytes);
    assert_eq!(
        limiter.available_permits(),
        size_in_bytes.div_ceil(permit_bytes)
    );
}
2274
2275    #[tokio::test]
2276    async fn range_result_memory_limiter_rejects_oversized_request() {
2277        let limiter = RangeResultMemoryLimiter::new(2 * 1024, 1024);
2278        assert_eq!(limiter.available_permits(), 2);
2279
2280        let err = limiter.acquire(10 * 1024).await.unwrap_err();
2281        assert!(
2282            err.to_string().contains("exceeds limiter capacity"),
2283            "unexpected error: {err}"
2284        );
2285        assert_eq!(limiter.available_permits(), 2);
2286    }
2287
2288    #[tokio::test]
2289    async fn range_result_memory_limiter_allows_request_up_to_capacity() {
2290        let limiter = RangeResultMemoryLimiter::new(2 * 1024, 1024);
2291        let permit = limiter.acquire(2 * 1024).await.unwrap();
2292        assert_eq!(limiter.available_permits(), 0);
2293        drop(permit);
2294        assert_eq!(limiter.available_permits(), 2);
2295    }
2296
2297    #[tokio::test]
2298    async fn test_evict_puffin_cache_clears_all_entries() {
2299        use std::collections::{BTreeMap, HashMap};
2300
2301        let cache = CacheManager::builder()
2302            .index_metadata_size(128)
2303            .index_content_size(128)
2304            .index_content_page_size(64)
2305            .index_result_cache_size(128)
2306            .puffin_metadata_size(128)
2307            .build();
2308        let cache = Arc::new(cache);
2309
2310        let region_id = RegionId::new(1, 1);
2311        let index_id = RegionIndexId::new(RegionFileId::new(region_id, FileId::random()), 0);
2312        let column_id: ColumnId = 1;
2313
2314        let bloom_cache = cache.bloom_filter_index_cache().unwrap().clone();
2315        let inverted_cache = cache.inverted_index_cache().unwrap().clone();
2316        let result_cache = cache.index_result_cache().unwrap();
2317        let puffin_metadata_cache = cache.puffin_metadata_cache().unwrap().clone();
2318
2319        let bloom_key = (
2320            index_id.file_id(),
2321            index_id.version,
2322            column_id,
2323            Tag::Skipping,
2324        );
2325        bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default()));
2326        inverted_cache.put_metadata(
2327            (index_id.file_id(), index_id.version),
2328            Arc::new(InvertedIndexMetas::default()),
2329        );
2330        let predicate = PredicateKey::new_bloom(Arc::new(BTreeMap::new()));
2331        let selection = Arc::new(RowGroupSelection::default());
2332        result_cache.put(predicate.clone(), index_id.file_id(), selection);
2333        let file_id_str = index_id.to_string();
2334        let metadata = Arc::new(FileMetadata {
2335            blobs: Vec::new(),
2336            properties: HashMap::new(),
2337        });
2338        puffin_metadata_cache.put_metadata(file_id_str.clone(), metadata);
2339
2340        assert!(bloom_cache.get_metadata(bloom_key).is_some());
2341        assert!(
2342            inverted_cache
2343                .get_metadata((index_id.file_id(), index_id.version))
2344                .is_some()
2345        );
2346        assert!(result_cache.get(&predicate, index_id.file_id()).is_some());
2347        assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_some());
2348
2349        cache.evict_puffin_cache(index_id).await;
2350
2351        assert!(bloom_cache.get_metadata(bloom_key).is_none());
2352        assert!(
2353            inverted_cache
2354                .get_metadata((index_id.file_id(), index_id.version))
2355                .is_none()
2356        );
2357        assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
2358        assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
2359
2360        // Refill caches and evict via CacheStrategy to ensure delegation works.
2361        bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default()));
2362        inverted_cache.put_metadata(
2363            (index_id.file_id(), index_id.version),
2364            Arc::new(InvertedIndexMetas::default()),
2365        );
2366        result_cache.put(
2367            predicate.clone(),
2368            index_id.file_id(),
2369            Arc::new(RowGroupSelection::default()),
2370        );
2371        puffin_metadata_cache.put_metadata(
2372            file_id_str.clone(),
2373            Arc::new(FileMetadata {
2374                blobs: Vec::new(),
2375                properties: HashMap::new(),
2376            }),
2377        );
2378
2379        let strategy = CacheStrategy::EnableAll(cache.clone());
2380        strategy.evict_puffin_cache(index_id).await;
2381
2382        assert!(bloom_cache.get_metadata(bloom_key).is_none());
2383        assert!(
2384            inverted_cache
2385                .get_metadata((index_id.file_id(), index_id.version))
2386                .is_none()
2387        );
2388        assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
2389        assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
2390    }
2391
2392    fn wide_region_metadata(column_count: u32) -> RegionMetadata {
2393        let region_id = RegionId::new(1024, 7);
2394        let mut builder = RegionMetadataBuilder::new(region_id);
2395        let mut primary_key = Vec::new();
2396
2397        for column_id in 0..column_count {
2398            let semantic_type = if column_id < 32 {
2399                primary_key.push(column_id);
2400                SemanticType::Tag
2401            } else {
2402                SemanticType::Field
2403            };
2404            let mut column_schema = ColumnSchema::new(
2405                format!("wide_column_{column_id}"),
2406                ConcreteDataType::string_datatype(),
2407                true,
2408            );
2409            column_schema
2410                .mut_metadata()
2411                .insert(format!("cache_key_{column_id}"), "cache_value".repeat(4));
2412            builder.push_column_metadata(ColumnMetadata {
2413                column_schema,
2414                semantic_type,
2415                column_id,
2416            });
2417        }
2418
2419        builder.push_column_metadata(ColumnMetadata {
2420            column_schema: ColumnSchema::new(
2421                "ts",
2422                ConcreteDataType::timestamp_millisecond_datatype(),
2423                false,
2424            ),
2425            semantic_type: SemanticType::Timestamp,
2426            column_id: column_count,
2427        });
2428        builder.primary_key(primary_key);
2429
2430        builder.build().unwrap()
2431    }
2432}