diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a7475232..6c7b7d295 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@ Tantivy 0.15.0
 =========================
 - API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
   This change is breaking but migration is trivial.
+- Added a Histogram collector. (@fulmicoton) #994
+
 Tantivy 0.14.0
 =========================
diff --git a/Cargo.toml b/Cargo.toml
index d54216238..0d9eb85c7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,7 @@ chrono = "0.4"
 smallvec = "1"
 rayon = "1"
 lru = "0.6"
+fastdivide = "0.3"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3"
diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs
new file mode 100644
index 000000000..4d28fe8c2
--- /dev/null
+++ b/src/collector/histogram_collector.rs
@@ -0,0 +1,291 @@
+use fastdivide::DividerU64;
+
+use crate::collector::{Collector, SegmentCollector};
+use crate::fastfield::{FastFieldReader, FastValue};
+use crate::schema::{Field, Type};
+use crate::{DocId, Score};
+
+/// `HistogramCollector` builds a histogram of the values of a fast field over
+/// the collected DocSet.
+///
+/// At construction, it is given parameters that define a partition of the interval
+/// `[min_value, max_value)` into `num_buckets` buckets of equal width.
+/// The i-th bucket covers `[min_value + i * bucket_width, min_value + (i+1) * bucket_width)`.
+///
+/// The resulting histogram is a `Vec<u64>` of length `num_buckets`, containing the
+/// number of documents falling into each bucket.
+///
+/// See [`HistogramCollector::new()`].
+///
+/// # Warning
+///
+/// f64 fields are not supported.
+#[derive(Clone)]
+pub struct HistogramCollector {
+    min_value: u64,
+    num_buckets: usize,
+    divider: DividerU64,
+    field: Field,
+}
+
+impl HistogramCollector {
+    /// Builds a new HistogramCollector.
+    ///
+    /// The scale/range of the histogram is not dynamic. It must be defined at
+    /// construction by supplying the following parameters:
+    /// - `min_value`: the minimum value that can be recorded in the histogram.
+    /// - `bucket_width`: the width of the interval associated with each bucket.
+    /// - `num_buckets`: the overall number of buckets.
+    ///
+    /// Together, these parameters define a partition of
+    /// `[min_value, min_value + num_buckets * bucket_width)` into `num_buckets`
+    /// intervals of width `bucket_width`, called buckets.
+    ///
+    /// # Panics
+    ///
+    /// This function panics if the given field is of type f64.
+    pub fn new<TFastValue: FastValue>(
+        field: Field,
+        min_value: TFastValue,
+        bucket_width: u64,
+        num_buckets: usize,
+    ) -> HistogramCollector {
+        let fast_type = TFastValue::to_type();
+        assert!(fast_type == Type::U64 || fast_type == Type::I64 || fast_type == Type::Date);
+        HistogramCollector {
+            min_value: min_value.to_u64(),
+            num_buckets,
+            field,
+            divider: DividerU64::divide_by(bucket_width),
+        }
+    }
+}
+
+struct HistogramComputer {
+    counts: Vec<u64>,
+    min_value: u64,
+    divider: DividerU64,
+}
+
+impl HistogramComputer {
+    #[inline]
+    pub(crate) fn add_value(&mut self, value: u64) {
+        if value < self.min_value {
+            return;
+        }
+        let delta = value - self.min_value;
+        let bucket_id: usize = self.divider.divide(delta) as usize;
+        if bucket_id < self.counts.len() {
+            self.counts[bucket_id] += 1;
+        }
+    }
+
+    fn harvest(self) -> Vec<u64> {
+        self.counts
+    }
+}
+
+pub struct SegmentHistogramCollector {
+    histogram_computer: HistogramComputer,
+    ff_reader: FastFieldReader<u64>,
+}
+
+impl SegmentCollector for SegmentHistogramCollector {
+    type Fruit = Vec<u64>;
+
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        let value = self.ff_reader.get(doc);
+        self.histogram_computer.add_value(value);
+    }
+
+    fn harvest(self) -> Self::Fruit {
+        self.histogram_computer.harvest()
+    }
+}
+
+impl Collector for HistogramCollector {
+    type Fruit = Vec<u64>;
+    type Child = SegmentHistogramCollector;
+
+    fn for_segment(
+        &self,
+        _segment_local_id: crate::SegmentLocalId,
+        segment: &crate::SegmentReader,
+    ) -> crate::Result<SegmentHistogramCollector> {
+        let ff_reader = segment.fast_fields().u64_lenient(self.field)?;
+        Ok(SegmentHistogramCollector {
+            histogram_computer: HistogramComputer {
+                counts: vec![0; self.num_buckets],
+                min_value: self.min_value,
+                divider: self.divider,
+            },
+            ff_reader,
+        })
+    }
+
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+
+    fn merge_fruits(&self, child_histograms: Vec<Vec<u64>>) -> crate::Result<Vec<u64>> {
+        Ok(add_vecs(child_histograms, self.num_buckets))
+    }
+}
+
+pub fn add_arrays_into(acc: &mut [u64], add: &[u64]) {
+    assert_eq!(acc.len(), add.len());
+    for (dest_bucket, bucket_count) in acc.iter_mut().zip(add) {
+        *dest_bucket += bucket_count;
+    }
+}
+
+fn add_vecs(mut vals_list: Vec<Vec<u64>>, len: usize) -> Vec<u64> {
+    let mut acc = vals_list.pop().unwrap_or_else(|| vec![0u64; len]);
+    assert_eq!(acc.len(), len);
+    for vals in vals_list {
+        add_arrays_into(&mut acc, &vals);
+    }
+    acc
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{add_vecs, HistogramCollector, HistogramComputer};
+    use crate::chrono::{TimeZone, Utc};
+    use crate::query::AllQuery;
+    use crate::schema::{Schema, FAST};
+    use crate::{doc, Index};
+    use fastdivide::DividerU64;
+
+    #[test]
+    fn test_add_histograms_simple() {
+        assert_eq!(
+            add_vecs(vec![vec![1, 0, 3], vec![11, 2, 3], vec![0, 0, 1]], 3),
+            vec![12, 2, 7]
+        )
+    }
+
+    #[test]
+    fn test_add_histograms_empty() {
+        assert_eq!(add_vecs(vec![], 3), vec![0, 0, 0])
+    }
+
+    #[test]
+    fn test_histogram_builder_simple() {
+        // [1..3)
+        // [3..5)
+        // ..
+        // [9..11)
+        let mut histogram_computer = HistogramComputer {
+            counts: vec![0; 5],
+            min_value: 1,
+            divider: DividerU64::divide_by(2),
+        };
+        histogram_computer.add_value(1);
+        histogram_computer.add_value(7);
+        assert_eq!(histogram_computer.harvest(), vec![1, 0, 0, 1, 0]);
+    }
+
+    #[test]
+    fn test_histogram_too_low_is_ignored() {
+        let mut histogram_computer = HistogramComputer {
+            counts: vec![0; 5],
+            min_value: 2,
+            divider: DividerU64::divide_by(2),
+        };
+        histogram_computer.add_value(0);
+        assert_eq!(histogram_computer.harvest(), vec![0, 0, 0, 0, 0]);
+    }
+
+    #[test]
+    fn test_histogram_too_high_is_ignored() {
+        let mut histogram_computer = HistogramComputer {
+            counts: vec![0u64; 5],
+            min_value: 0,
+            divider: DividerU64::divide_by(2),
+        };
+        histogram_computer.add_value(10);
+        assert_eq!(histogram_computer.harvest(), vec![0, 0, 0, 0, 0]);
+    }
+
+    #[test]
+    fn test_no_segments() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let val_field = schema_builder.add_u64_field("val_field", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        let all_query = AllQuery;
+        let histogram_collector = HistogramCollector::new(val_field, 0u64, 2, 5);
+        let histogram = searcher.search(&all_query, &histogram_collector)?;
+        assert_eq!(histogram, vec![0; 5]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_histogram_i64() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let val_field = schema_builder.add_i64_field("val_field", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
+        writer.add_document(doc!(val_field=>12i64));
+        writer.add_document(doc!(val_field=>-30i64));
+        writer.add_document(doc!(val_field=>-12i64));
+        writer.add_document(doc!(val_field=>-10i64));
+        writer.commit()?;
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        let all_query = AllQuery;
+        let histogram_collector = HistogramCollector::new(val_field, -20i64, 10u64, 4);
+        let histogram = searcher.search(&all_query, &histogram_collector)?;
+        assert_eq!(histogram, vec![1, 1, 0, 1]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_histogram_merge() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let val_field = schema_builder.add_i64_field("val_field", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
+        writer.add_document(doc!(val_field=>12i64));
+        writer.commit()?;
+        writer.add_document(doc!(val_field=>-30i64));
+        writer.commit()?;
+        writer.add_document(doc!(val_field=>-12i64));
+        writer.commit()?;
+        writer.add_document(doc!(val_field=>-10i64));
+        writer.commit()?;
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        let all_query = AllQuery;
+        let histogram_collector = HistogramCollector::new(val_field, -20i64, 10u64, 4);
+        let histogram = searcher.search(&all_query, &histogram_collector)?;
+        assert_eq!(histogram, vec![1, 1, 0, 1]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_histogram_dates() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let date_field = schema_builder.add_date_field("date_field", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
+        writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0, 0)));
+        writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
+        writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
+        writer.commit()?;
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+        let all_query = AllQuery;
+        let week_histogram_collector = HistogramCollector::new(
+            date_field,
+            Utc.ymd(1980, 1, 1).and_hms(0, 0, 0),
+            3600 * 24 * 365, // it is just for a unit test... sorry leap years.
+            10,
+        );
+        let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;
+        assert_eq!(week_histogram, vec![0, 0, 1, 1, 0, 0, 1, 0, 0, 0]);
+        Ok(())
+    }
+}
diff --git a/src/collector/mod.rs b/src/collector/mod.rs
index 6e567047e..79f4efa12 100644
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -93,6 +93,9 @@ use downcast_rs::impl_downcast;
 mod count_collector;
 pub use self::count_collector::Count;
 
+mod histogram_collector;
+pub use histogram_collector::HistogramCollector;
+
 mod multi_collector;
 pub use self::multi_collector::MultiCollector;
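For context, a minimal usage sketch of the new collector from a caller's point of view, modeled on the `test_histogram_i64` test above. The `price` field and its values are made up for illustration; only `HistogramCollector` and its `new(field, min_value, bucket_width, num_buckets)` signature come from this diff.

```rust
use tantivy::collector::HistogramCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // `price` is a hypothetical FAST i64 field, used only for this sketch.
    let price = schema_builder.add_i64_field("price", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
    writer.add_document(doc!(price => 5i64));
    writer.add_document(doc!(price => 14i64));
    writer.add_document(doc!(price => 44i64));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // 4 buckets of width 10 covering [0, 40); values outside that range are ignored.
    let collector = HistogramCollector::new(price, 0i64, 10u64, 4);
    let histogram = searcher.search(&AllQuery, &collector)?;
    assert_eq!(histogram, vec![1, 1, 0, 0]);
    Ok(())
}
```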
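The bucketing rule implemented by `HistogramComputer::add_value` boils down to `(value - min_value) / bucket_width`, with values below `min_value` or beyond the last bucket silently dropped rather than clamped. The sketch below only restates that logic; the `bucket_of` helper is hypothetical and not part of the diff, while `DividerU64::divide_by` and `divide` are the fastdivide calls used above.

```rust
use fastdivide::DividerU64;

// Mirrors the out-of-range handling in HistogramComputer::add_value.
fn bucket_of(value: u64, min_value: u64, divider: &DividerU64, num_buckets: usize) -> Option<usize> {
    if value < min_value {
        return None; // below the histogram range
    }
    let bucket_id = divider.divide(value - min_value) as usize;
    if bucket_id < num_buckets {
        Some(bucket_id)
    } else {
        None // past the last bucket
    }
}

fn main() {
    let divider = DividerU64::divide_by(10); // bucket_width = 10
    assert_eq!(bucket_of(25, 20, &divider, 4), Some(0)); // [20, 30)
    assert_eq!(bucket_of(59, 20, &divider, 4), Some(3)); // [50, 60)
    assert_eq!(bucket_of(60, 20, &divider, 4), None);    // beyond the last bucket
    assert_eq!(bucket_of(3, 20, &divider, 4), None);     // below min_value
}
```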