Compare commits


2 Commits

Author          SHA1        Message                             Date
trinity-1686a   116c6d3621  make Term::as_slice public          2023-02-09 13:43:01 +01:00
PSeitz          0f20787917  fix doc store cache docs (#1821)    2023-01-23 07:06:49 +01:00

Commit message body for 0f20787917:

* fix doc store cache docs

  addresses an issue reported in #1820

* rename doc_store_cache_size
7 changed files with 51 additions and 104 deletions

View File

@@ -11,7 +11,7 @@ use crate::aggregation::agg_req_with_accessor::{
use crate::aggregation::intermediate_agg_result::{
IntermediateBucketResult, IntermediateTermBucketEntry, IntermediateTermBucketResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
use crate::error::DataCorruption;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::schema::Type;
@@ -268,18 +268,21 @@ impl TermBuckets {
term_ids: &[u64],
doc: DocId,
sub_aggregation: &AggregationsWithAccessor,
bucket_count: &BucketCount,
blueprint: &Option<SegmentAggregationResultsCollector>,
) -> crate::Result<()> {
for &term_id in term_ids {
let entry = self
.entries
.entry(term_id as u32)
.or_insert_with(|| TermBucketEntry::from_blueprint(blueprint));
let entry = self.entries.entry(term_id as u32).or_insert_with(|| {
bucket_count.add_count(1);
TermBucketEntry::from_blueprint(blueprint)
});
entry.doc_count += 1;
if let Some(sub_aggregations) = entry.sub_aggregations.as_mut() {
sub_aggregations.collect(doc, sub_aggregation)?;
}
}
bucket_count.validate_bucket_count()?;
Ok(())
}
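
The change above threads a shared `BucketCount` through `increment_bucket`: each newly created term bucket bumps a counter, and `validate_bucket_count` aborts the aggregation once a configured maximum is exceeded. Below is a minimal sketch of that guard pattern; the field names mirror the bench setup later in this diff, but the `String` error and everything else are placeholders, not tantivy's actual implementation.

```rust
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;

// Placeholder sketch of the bucket-count guard threaded through collection.
#[derive(Default)]
struct BucketCount {
    bucket_count: Arc<AtomicU32>, // shared across all collectors of one request
    max_bucket_count: u32,
}

impl BucketCount {
    fn add_count(&self, count: u32) {
        self.bucket_count.fetch_add(count, Ordering::Relaxed);
    }

    fn validate_bucket_count(&self) -> Result<(), String> {
        if self.bucket_count.load(Ordering::Relaxed) > self.max_bucket_count {
            return Err(format!(
                "aggregation aborted: bucket limit of {} exceeded",
                self.max_bucket_count
            ));
        }
        Ok(())
    }
}

fn main() {
    let count = BucketCount {
        bucket_count: Arc::default(),
        max_bucket_count: 2,
    };
    count.add_count(3); // e.g. three new term buckets were created
    assert!(count.validate_bucket_count().is_err());
}
```

Checking the limit during collection (rather than only when building the intermediate result, as before) means a runaway request fails early instead of first materializing all of its buckets.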
@@ -369,7 +372,7 @@ impl SegmentTermCollector {
}
OrderTarget::SubAggregation(_name) => {
// don't sort and cut off since it's hard to make assumptions on the quality of the
// results when cutting off due to unknown nature of the sub_aggregation (possible
// results when cutting off du to unknown nature of the sub_aggregation (possible
// to check).
}
OrderTarget::Count => {
@@ -409,10 +412,6 @@ impl SegmentTermCollector {
if self.req.min_doc_count == 0 {
let mut stream = term_dict.stream()?;
while let Some((key, _ord)) = stream.next() {
if dict.len() >= self.req.segment_size as usize {
break;
}
let key = std::str::from_utf8(key)
.map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?;
if !dict.contains_key(key) {
@@ -434,8 +433,6 @@ impl SegmentTermCollector {
sum_other_doc_count += sum_other_docs;
dict = dict_entries.into_iter().collect();
}
agg_with_accessor.bucket_count.add_count(dict.len() as u32);
agg_with_accessor.bucket_count.validate_bucket_count()?;
Ok(IntermediateBucketResult::Terms(
IntermediateTermBucketResult {
@@ -472,24 +469,28 @@ impl SegmentTermCollector {
&vals1,
docs[0],
&bucket_with_accessor.sub_aggregation,
&bucket_with_accessor.bucket_count,
&self.blueprint,
)?;
self.term_buckets.increment_bucket(
&vals2,
docs[1],
&bucket_with_accessor.sub_aggregation,
&bucket_with_accessor.bucket_count,
&self.blueprint,
)?;
self.term_buckets.increment_bucket(
&vals3,
docs[2],
&bucket_with_accessor.sub_aggregation,
&bucket_with_accessor.bucket_count,
&self.blueprint,
)?;
self.term_buckets.increment_bucket(
&vals4,
docs[3],
&bucket_with_accessor.sub_aggregation,
&bucket_with_accessor.bucket_count,
&self.blueprint,
)?;
}
@@ -500,6 +501,7 @@ impl SegmentTermCollector {
&vals1,
doc,
&bucket_with_accessor.sub_aggregation,
&bucket_with_accessor.bucket_count,
&self.blueprint,
)?;
}
@@ -1134,33 +1136,6 @@ mod tests {
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
min_doc_count: Some(0),
size: Some(1),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
// searching for terma, but min_doc_count=0 will return all terms
let res = exec_request_with_query(agg_req, &index, Some(("string_id", "terma")))?;
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(
res["my_texts"]["buckets"][1]["key"],
serde_json::Value::Null
);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
Ok(())
}
@@ -1239,27 +1214,6 @@ mod tests {
let index = get_test_index_from_terms(true, &terms_per_segment)?;
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
// min_doc_count: Some(0),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None);
assert!(res.is_ok());
// This request has min_doc_count set to 0
// That means we potentially load the whole dict
// Make sure the bucket count is still fine
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
@@ -1274,24 +1228,6 @@ mod tests {
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None);
assert!(res.is_ok());
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "string_id".to_string(),
// min_doc_count: Some(0),
size: Some(70_000),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None);
assert!(res.is_err());
@@ -1448,10 +1384,14 @@ mod bench {
let mut collector = get_collector_with_buckets(total_terms);
let vals = get_rand_terms(total_terms, num_terms);
let aggregations_with_accessor: AggregationsWithAccessor = Default::default();
let bucket_count: BucketCount = BucketCount {
bucket_count: Default::default(),
max_bucket_count: 1_000_001u32,
};
b.iter(|| {
for &val in &vals {
collector
.increment_bucket(&[val], 0, &aggregations_with_accessor, &None)
.increment_bucket(&[val], 0, &aggregations_with_accessor, &bucket_count, &None)
.unwrap();
}
})

View File

@@ -249,7 +249,7 @@ impl SearcherInner {
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
doc_store_cache_size: usize,
doc_store_cache_num_blocks: usize,
) -> io::Result<SearcherInner> {
assert_eq!(
&segment_readers
@@ -261,7 +261,7 @@ impl SearcherInner {
);
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_num_blocks))
.collect::<io::Result<Vec<_>>>()?;
Ok(SearcherInner {

View File

@@ -134,9 +134,12 @@ impl SegmentReader {
&self.fieldnorm_readers
}
/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self, cache_size: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_size)
/// Accessor to the segment's [`StoreReader`](crate::store::StoreReader).
///
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
/// The size of blocks is configurable; this should be reflected in the chosen `cache_num_blocks`.
pub fn get_store_reader(&self, cache_num_blocks: usize) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone(), cache_num_blocks)
}
/// Open a new segment for reading.
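
Only the parameter name changes here, so a short usage sketch may help show what the renamed argument controls. It builds a throwaway in-RAM index and opens a store reader caching 32 decompressed blocks; 32 is an arbitrary choice, and the surrounding calls assume the tantivy API of this era.

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(title => "hello"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    for segment_reader in searcher.segment_readers() {
        // `cache_num_blocks` now unambiguously counts decompressed blocks,
        // not bytes; 32 is an arbitrary value for this sketch.
        let store_reader = segment_reader.get_store_reader(32)?;
        println!("{:?}", store_reader.get(0)?);
    }
    Ok(())
}
```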

View File

@@ -44,7 +44,7 @@ pub struct IndexReaderBuilder {
index: Index,
warmers: Vec<Weak<dyn Warmer>>,
num_warming_threads: usize,
doc_store_cache_size: usize,
doc_store_cache_num_blocks: usize,
}
impl IndexReaderBuilder {
@@ -55,7 +55,7 @@ impl IndexReaderBuilder {
index,
warmers: Vec::new(),
num_warming_threads: 1,
doc_store_cache_size: DOCSTORE_CACHE_CAPACITY,
doc_store_cache_num_blocks: DOCSTORE_CACHE_CAPACITY,
}
}
@@ -72,7 +72,7 @@ impl IndexReaderBuilder {
searcher_generation_inventory.clone(),
)?;
let inner_reader = InnerIndexReader::new(
self.doc_store_cache_size,
self.doc_store_cache_num_blocks,
self.index,
warming_state,
searcher_generation_inventory,
@@ -119,8 +119,11 @@ impl IndexReaderBuilder {
///
/// By default, the doc store readers cache `DOCSTORE_CACHE_CAPACITY` (100) decompressed blocks.
#[must_use]
pub fn doc_store_cache_size(mut self, doc_store_cache_size: usize) -> IndexReaderBuilder {
self.doc_store_cache_size = doc_store_cache_size;
pub fn doc_store_cache_num_blocks(
mut self,
doc_store_cache_num_blocks: usize,
) -> IndexReaderBuilder {
self.doc_store_cache_num_blocks = doc_store_cache_num_blocks;
self
}
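
The renamed builder method composes exactly as before. A brief sketch, using the `TryInto<IndexReader>` impl from the next hunk; 256 is an arbitrary block count:

```rust
use tantivy::{Index, IndexReader};

// Sketch: cache up to 256 decompressed doc store blocks per store reader.
fn open_reader(index: &Index) -> tantivy::Result<IndexReader> {
    index
        .reader_builder()
        .doc_store_cache_num_blocks(256)
        .try_into()
}
```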
@@ -151,7 +154,7 @@ impl TryInto<IndexReader> for IndexReaderBuilder {
}
struct InnerIndexReader {
doc_store_cache_size: usize,
doc_store_cache_num_blocks: usize,
index: Index,
warming_state: WarmingState,
searcher: arc_swap::ArcSwap<SearcherInner>,
@@ -161,7 +164,7 @@ struct InnerIndexReader {
impl InnerIndexReader {
fn new(
doc_store_cache_size: usize,
doc_store_cache_num_blocks: usize,
index: Index,
warming_state: WarmingState,
// The searcher_generation_inventory is not used as source, but as target to track the
@@ -172,13 +175,13 @@ impl InnerIndexReader {
let searcher = Self::create_searcher(
&index,
doc_store_cache_size,
doc_store_cache_num_blocks,
&warming_state,
&searcher_generation_counter,
&searcher_generation_inventory,
)?;
Ok(InnerIndexReader {
doc_store_cache_size,
doc_store_cache_num_blocks,
index,
warming_state,
searcher: ArcSwap::from(searcher),
@@ -214,7 +217,7 @@ impl InnerIndexReader {
fn create_searcher(
index: &Index,
doc_store_cache_size: usize,
doc_store_cache_num_blocks: usize,
warming_state: &WarmingState,
searcher_generation_counter: &Arc<AtomicU64>,
searcher_generation_inventory: &Inventory<SearcherGeneration>,
@@ -232,7 +235,7 @@ impl InnerIndexReader {
index.clone(),
segment_readers,
searcher_generation,
doc_store_cache_size,
doc_store_cache_num_blocks,
)?);
warming_state.warm_new_searcher_generation(&searcher.clone().into())?;
@@ -242,7 +245,7 @@ impl InnerIndexReader {
fn reload(&self) -> crate::Result<()> {
let searcher = Self::create_searcher(
&self.index,
self.doc_store_cache_size,
self.doc_store_cache_num_blocks,
&self.warming_state,
&self.searcher_generation_counter,
&self.searcher_generation_inventory,

View File

@@ -375,7 +375,7 @@ where B: AsRef<[u8]>
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
pub(crate) fn as_slice(&self) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
self.0.as_ref()
}
}
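
With `as_slice` now public, callers can inspect a term's raw bytes. A small sketch of what this enables, subject to the caveat above that the byte representation is not stable:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let term = Term::from_field_text(title, "hello");
    // Raw bytes: the encoded field followed by the encoded value. Do NOT
    // persist this representation; per the doc comment, it may change.
    println!("term is {} bytes", term.as_slice().len());
}
```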

View File

@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds 16K, the buffer is compressed using `brotli`, `LZ4` or `snappy`
//! and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4` or `snappy` and the resulting block is written to disk.
//!
//! One can then request for a specific `DocId`.
//! A skip list helps navigating to the right block,
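
The module doc above describes a buffer-then-compress loop plus a skip list for lookup. A schematic sketch of that write path follows; every name here (`BlockWriter`, `compress`) is illustrative and none of it is tantivy's actual internals:

```rust
use std::io::Write;

// Schematic sketch of the block-buffered write path described above.
struct BlockWriter<W: Write> {
    buffer: Vec<u8>,
    block_size: usize, // the doc store defaults to 16K
    out: W,
}

impl<W: Write> BlockWriter<W> {
    fn add_serialized_doc(&mut self, doc_bytes: &[u8]) -> std::io::Result<()> {
        self.buffer.extend_from_slice(doc_bytes);
        if self.buffer.len() > self.block_size {
            self.flush_block()?;
        }
        Ok(())
    }

    fn flush_block(&mut self) -> std::io::Result<()> {
        // Stand-in for brotli/LZ4/snappy compression of the buffer.
        let compressed = compress(&self.buffer);
        self.out.write_all(&compressed)?;
        // A real implementation would also record a (first DocId, file offset)
        // entry for the skip list used to locate blocks on read.
        self.buffer.clear();
        Ok(())
    }
}

fn compress(data: &[u8]) -> Vec<u8> {
    data.to_vec() // identity stand-in for a real codec
}

fn main() -> std::io::Result<()> {
    let mut w = BlockWriter { buffer: Vec::new(), block_size: 16_384, out: Vec::new() };
    w.add_serialized_doc(b"serialized doc bytes")?;
    w.flush_block()? // flush the final partial block
        ; Ok(())
}
```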
@@ -28,8 +28,6 @@
//! - at the segment level, the
//! [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
//! - at the index level, the [`Searcher::doc()`](crate::Searcher::doc) method
//!
//! !
mod compressors;
mod decompressors;

View File

@@ -114,7 +114,10 @@ impl Sum for CacheStats {
impl StoreReader {
/// Opens a store reader
pub fn open(store_file: FileSlice, cache_size: usize) -> io::Result<StoreReader> {
///
/// `cache_num_blocks` sets the number of decompressed blocks to be cached in an LRU.
/// The size of blocks is configurable; this should be reflected in the chosen `cache_num_blocks`.
pub fn open(store_file: FileSlice, cache_num_blocks: usize) -> io::Result<StoreReader> {
let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?;
let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize);
@@ -125,8 +128,8 @@ impl StoreReader {
decompressor: footer.decompressor,
data: data_file,
cache: BlockCache {
cache: NonZeroUsize::new(cache_size)
.map(|cache_size| Mutex::new(LruCache::new(cache_size))),
cache: NonZeroUsize::new(cache_num_blocks)
.map(|cache_num_blocks| Mutex::new(LruCache::new(cache_num_blocks))),
cache_hits: Default::default(),
cache_misses: Default::default(),
},
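
One consequence of this construction is worth noting: `NonZeroUsize::new(0)` returns `None`, so passing `cache_num_blocks = 0` builds no `LruCache` at all and disables doc store caching entirely. A tiny illustration:

```rust
use std::num::NonZeroUsize;

fn main() {
    // 0 -> None: the `map` above never runs, so no LRU is created.
    assert!(NonZeroUsize::new(0).is_none());
    // Any positive count becomes a valid LRU capacity.
    assert_eq!(NonZeroUsize::new(100).unwrap().get(), 100);
}
```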