Minor refactoring preparing for getting columnar integrated in quickwit. (#1911)

2026-06-03 17:10:48 +00:00 · 2023-02-27 14:23:30 +09:00
parent 0a726a0897
commit 8ea97e7d6b
6 changed files with 22 additions and 81 deletions
--- a/src/aggregation/agg_req.rs
+++ b/src/aggregation/agg_req.rs
@@ -124,15 +124,6 @@ impl BucketAggregationInternal {
    }
 }

-/// Extract all fields, where the term directory is used in the tree.
-pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
-    let mut term_dict_field_names = Default::default();
-    for el in aggs.values() {
-        el.get_term_dict_field_names(&mut term_dict_field_names)
-    }
-    term_dict_field_names
-}
-
 /// Extract all fast field names used in the tree.
 pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
    let mut fast_field_names = Default::default();
@@ -155,16 +146,12 @@ pub enum Aggregation {
 }

 impl Aggregation {
-    fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
-        if let Aggregation::Bucket(bucket) = self {
-            bucket.get_term_dict_field_names(term_field_names)
-        }
-    }
-
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
-            Aggregation::Metric(metric) => metric.get_fast_field_names(fast_field_names),
+            Aggregation::Metric(metric) => {
+                fast_field_names.insert(metric.get_fast_field_name().to_string());
+            }
        }
    }
 }
@@ -193,14 +180,9 @@ pub struct BucketAggregation {
 }

 impl BucketAggregation {
-    fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
-        if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
-            term_dict_field_names.insert(terms.field.to_string());
-        }
-        term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
-    }
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
-        self.bucket_agg.get_fast_field_names(fast_field_names);
+        let fast_field_name = self.bucket_agg.get_fast_field_name();
+        fast_field_names.insert(fast_field_name.to_string());
        fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
    }
 }
@@ -220,14 +202,12 @@ pub enum BucketAggregationType {
 }

 impl BucketAggregationType {
-    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
+    fn get_fast_field_name(&self) -> &str {
        match self {
-            BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
-            BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
-            BucketAggregationType::Histogram(histogram) => {
-                fast_field_names.insert(histogram.field.to_string())
-            }
-        };
+            BucketAggregationType::Terms(terms) => terms.field.as_str(),
+            BucketAggregationType::Range(range) => range.field.as_str(),
+            BucketAggregationType::Histogram(histogram) => histogram.field.as_str(),
+        }
    }
 }

@@ -262,16 +242,15 @@ pub enum MetricAggregation {
 }

 impl MetricAggregation {
-    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
-        let fast_field_name = match self {
+    fn get_fast_field_name(&self) -> &str {
+        match self {
            MetricAggregation::Average(avg) => avg.field_name(),
            MetricAggregation::Count(count) => count.field_name(),
            MetricAggregation::Max(max) => max.field_name(),
            MetricAggregation::Min(min) => min.field_name(),
            MetricAggregation::Stats(stats) => stats.field_name(),
            MetricAggregation::Sum(sum) => sum.field_name(),
-        };
-        fast_field_names.insert(fast_field_name.to_string());
+        }
    }
 }

--- a/src/aggregation/agg_tests.rs
+++ b/src/aggregation/agg_tests.rs
@@ -1,8 +1,7 @@
 use serde_json::Value;

 use crate::aggregation::agg_req::{
-    get_term_dict_field_names, Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
-    MetricAggregation,
+    Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
 };
 use crate::aggregation::agg_result::AggregationResults;
 use crate::aggregation::bucket::{RangeAggregation, TermsAggregation};
@@ -432,9 +431,6 @@ fn test_aggregation_level2(
        agg_req
    };

-    let field_names = get_term_dict_field_names(&agg_req);
-    assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
-
    let agg_res: AggregationResults = if use_distributed_collector {
        let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), None);

--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -505,8 +505,7 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
 mod tests {
    use super::*;
    use crate::aggregation::agg_req::{
-        get_term_dict_field_names, Aggregation, Aggregations, BucketAggregation,
-        BucketAggregationType, MetricAggregation,
+        Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
    };
    use crate::aggregation::metric::{AverageAggregation, StatsAggregation};
    use crate::aggregation::tests::{
@@ -607,12 +606,6 @@ mod tests {
            serde_json::Value::Null
        );
        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); // TODO sum_other_doc_count with min_doc_count
-
-        assert_eq!(
-            get_term_dict_field_names(&agg_req),
-            vec!["string_id".to_string(),].into_iter().collect()
-        );
-
        Ok(())
    }

--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -19,9 +19,8 @@
 //!
 //! Read access performance is comparable to that of an array lookup.

-use std::net::Ipv6Addr;
-
 pub use columnar::Column;
+use columnar::MonotonicallyMappableToU64;

 pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
 pub use self::error::{FastFieldNotAvailableError, Result};
@@ -37,38 +36,9 @@ mod facet_reader;
 mod readers;
 mod writer;

-/// Trait for types that provide a zero value.
-///
-/// The resulting value is never used, just as placeholder, e.g. for `vec.resize()`.
-pub trait MakeZero {
-    /// Build a default value. This default value is never used, so the value does not
-    /// really matter.
-    fn make_zero() -> Self;
-}
-
-impl<T: FastValue> MakeZero for T {
-    fn make_zero() -> Self {
-        T::from_u64(0)
-    }
-}
-
-impl MakeZero for u128 {
-    fn make_zero() -> Self {
-        0
-    }
-}
-
-impl MakeZero for Ipv6Addr {
-    fn make_zero() -> Self {
-        Ipv6Addr::from(0u128.to_be_bytes())
-    }
-}
-
 /// Trait for types that are allowed for fast fields:
 /// (u64, i64 and f64, bool, DateTime).
-pub trait FastValue:
-    Copy + Send + Sync + columnar::MonotonicallyMappableToU64 + PartialOrd + 'static
-{
+pub trait FastValue: MonotonicallyMappableToU64 {
    /// Returns the `schema::Type` for this FastValue.
    fn to_type() -> Type;
 }
@@ -105,6 +75,7 @@ impl FastValue for DateTime {
 #[cfg(test)]
 mod tests {

+    use std::net::Ipv6Addr;
    use std::ops::{Range, RangeInclusive};
    use std::path::Path;

--- a/src/termdict/sstable_termdict/merger.rs
+++ b/src/termdict/sstable_termdict/merger.rs
@@ -2,7 +2,7 @@ use std::cmp::Ordering;
 use std::collections::BinaryHeap;

 use crate::postings::TermInfo;
-use crate::termdict::{TermOrdinal, TermStreamer};
+use crate::termdict::TermStreamer;

 pub struct HeapItem<'a> {
    pub streamer: TermStreamer<'a>,
--- a/src/tokenizer/stop_word_filter/mod.rs
+++ b/src/tokenizer/stop_word_filter/mod.rs
@@ -18,7 +18,9 @@ use std::sync::Arc;

 use rustc_hash::FxHashSet;

-use super::{BoxTokenStream, Language, Token, TokenFilter, TokenStream};
+#[cfg(feature = "stopwords")]
+use super::Language;
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]