Compare commits

(base)..(head) — compare range selector; branch/commit identifiers lost in page extraction

3 Commits

Author | SHA1 | Message | Date
Pascal Seitz
18de6f477b add failing test check_num_columnar_fields 2023-07-13 18:19:02 +09:00
PSeitz
1e7cd48cfa remove allocations in split compound words (#2080)
* remove allocations in split compound words

* clear reused data
2023-07-13 09:43:02 +09:00
dependabot[bot]
7f51d85bbd Update lru requirement from 0.10.0 to 0.11.0 (#2117)
Updates the requirements on [lru](https://github.com/jeromefroe/lru-rs) to permit the latest version.
- [Changelog](https://github.com/jeromefroe/lru-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jeromefroe/lru-rs/compare/0.10.0...0.11.0)

---
updated-dependencies:
- dependency-name: lru
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-07-13 09:42:21 +09:00
5 changed files with 68 additions and 36 deletions

View File

@@ -49,7 +49,7 @@ murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.10.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
measure_time = "0.8.2"

View File

@@ -123,15 +123,6 @@ impl AggregationWithAccessor {
column_block_accessor: Default::default(),
})
}
/// Swaps the accessor and field type with the second accessor and field type.
/// This way we can use the same code for both aggregations.
pub(crate) fn swap_accessor(&mut self) {
if let Some(accessor) = self.accessor2.as_mut() {
std::mem::swap(&mut accessor.0, &mut self.accessor);
std::mem::swap(&mut accessor.1, &mut self.field_type);
}
}
}
fn get_numeric_or_date_column_types() -> &'static [ColumnType] {

View File

@@ -263,9 +263,9 @@ impl SegmentAggregationCollector for SegmentTermCollectorComposite {
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.term_agg1.collect_block(&[doc], agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.collect_block(&[doc], agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
@@ -276,22 +276,33 @@ impl SegmentAggregationCollector for SegmentTermCollectorComposite {
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.term_agg1.collect_block(docs, agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.collect_block(docs, agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
fn flush(&mut self, agg_with_accessor: &mut AggregationsWithAccessor) -> crate::Result<()> {
self.term_agg1.flush(agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
self.term_agg2.flush(agg_with_accessor)?;
agg_with_accessor.aggs.values[self.accessor_idx].swap_accessor();
self.swap_accessor(&mut agg_with_accessor.aggs.values[self.accessor_idx]);
Ok(())
}
}
impl SegmentTermCollectorComposite {
/// Swaps the accessor and field type with the second accessor and field type.
/// This way we can use the same code for both aggregations.
fn swap_accessor(&self, aggregations: &mut AggregationWithAccessor) {
if let Some(accessor) = aggregations.accessor2.as_mut() {
std::mem::swap(&mut accessor.0, &mut aggregations.accessor);
std::mem::swap(&mut accessor.1, &mut aggregations.field_type);
}
}
pub(crate) fn from_req_and_validate(
req: &TermsAggregation,
sub_aggregations: &mut AggregationsWithAccessor,

View File

@@ -1291,4 +1291,28 @@ mod tests {
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
}
#[test]
fn check_num_columnar_fields() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_text_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 1u64,
))?;
index_writer.commit()?;
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let ff_reader = searcher.segment_reader(0).fast_fields();
let fields = ff_reader.u64_lenient_for_type_all(None, "id").unwrap();
assert_eq!(fields.len(), 1);
Ok(())
}
}

View File

@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<T> {
dict: AhoCorasick,
tail: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: &mut self.cuts,
parts: &mut self.parts,
}
}
}
pub struct SplitCompoundWordsTokenStream<'a, T> {
dict: AhoCorasick,
tail: T,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
}
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
}
}
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
fn advance(&mut self) -> bool {
self.parts.pop();