Added an histogram collector.

Closes #994
This commit is contained in:
Paul Masurel
2021-03-18 16:54:42 +09:00
parent 52b1eb2c37
commit 761298ff00
4 changed files with 297 additions and 0 deletions

View File

@@ -2,6 +2,8 @@ Tantivy 0.15.0
=========================
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
This change is breaking but migration is trivial.
- Added an Histogram collector. (@fulmicoton) #994
Tantivy 0.14.0
=========================

View File

@@ -48,6 +48,7 @@ chrono = "0.4"
smallvec = "1"
rayon = "1"
lru = "0.6"
fastdivide = "0.3"
[target.'cfg(windows)'.dependencies]
winapi = "0.3"

View File

@@ -0,0 +1,291 @@
use fastdivide::DividerU64;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldReader, FastValue};
use crate::schema::{Field, Type};
use crate::{DocId, Score};
/// Histogram builds an histogram of the values of a fastfield for the
/// collected DocSet.
///
/// At construction, it is given parameters that define a partition of an interval
/// [min_val, max_val) into N buckets with the same width.
/// The ith bucket is then defined by `[min_val + i * bucket_width, min_val + (i+1) * bucket_width)`
///
/// An histogram is then defined as a `Vec<u64>` of length `num_buckets`, that contains a count of
/// documents for each value bucket.
///
/// See [`Histogram::new()`].
///
/// # Warning
///
/// f64 field. are not supported.
#[derive(Clone)]
pub struct HistogramCollector {
min_value: u64,
num_buckets: usize,
divider: DividerU64,
field: Field,
}
impl HistogramCollector {
/// Builds a new HistogramCollector.
///
/// The scale/range of the histogram is not dynamic. It is required to
/// define it by supplying following parameter:
/// - `min_value`: the minimum value that can be recorded in the histogram.
/// - `bucket_width`: the length of the interval that is associated to each buckets.
/// - `num_buckets`: The overall number of buckets.
///
/// Together, this parameters define a partition of `[min_value, min_value + num_buckets * bucket_width)`
/// into `num_buckets` intervals of width bucket that we call `bucket`.
///
/// # Disclaimer
/// This function panics if the field given is of type f64.
pub fn new<TFastValue: FastValue>(
field: Field,
min_value: TFastValue,
bucket_width: u64,
num_buckets: usize,
) -> HistogramCollector {
let fast_type = TFastValue::to_type();
assert!(fast_type == Type::U64 || fast_type == Type::I64 || fast_type == Type::Date);
HistogramCollector {
min_value: min_value.to_u64(),
num_buckets,
field,
divider: DividerU64::divide_by(bucket_width),
}
}
}
struct HistogramComputer {
counts: Vec<u64>,
min_value: u64,
divider: DividerU64,
}
impl HistogramComputer {
#[inline]
pub(crate) fn add_value(&mut self, value: u64) {
if value < self.min_value {
return;
}
let delta = value - self.min_value;
let delta_u64 = delta.to_u64();
let bucket_id: usize = self.divider.divide(delta_u64) as usize;
if bucket_id < self.counts.len() {
self.counts[bucket_id] += 1;
}
}
fn harvest(self) -> Vec<u64> {
self.counts
}
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: FastFieldReader<u64>,
}
impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get(doc);
self.histogram_computer.add_value(value);
}
fn harvest(self) -> Self::Fruit {
self.histogram_computer.harvest()
}
}
impl Collector for HistogramCollector {
type Fruit = Vec<u64>;
type Child = SegmentHistogramCollector;
fn for_segment(
&self,
_segment_local_id: crate::SegmentLocalId,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let ff_reader = segment.fast_fields().u64_lenient(self.field)?;
Ok(SegmentHistogramCollector {
histogram_computer: HistogramComputer {
counts: vec![0; self.num_buckets],
min_value: self.min_value,
divider: self.divider,
},
ff_reader,
})
}
fn requires_scoring(&self) -> bool {
false
}
fn merge_fruits(&self, child_histograms: Vec<Vec<u64>>) -> crate::Result<Vec<u64>> {
Ok(add_vecs(child_histograms, self.num_buckets))
}
}
pub fn add_arrays_into(acc: &mut [u64], add: &[u64]) {
assert_eq!(acc.len(), add.len());
for (dest_bucket, bucket_count) in acc.iter_mut().zip(add) {
*dest_bucket += bucket_count;
}
}
fn add_vecs(mut vals_list: Vec<Vec<u64>>, len: usize) -> Vec<u64> {
let mut acc = vals_list.pop().unwrap_or_else(|| vec![0u64; len]);
assert_eq!(acc.len(), len);
for vals in vals_list {
add_arrays_into(&mut acc, &vals);
}
acc
}
#[cfg(test)]
mod tests {
use super::{add_vecs, HistogramComputer, HistogramCollector};
use fastdivide::DividerU64;
use query::AllQuery;
use crate::chrono::{TimeZone, Utc};
use crate::schema::{Schema, FAST};
use crate::{Index, doc, query};
#[test]
fn test_add_histograms_simple() {
assert_eq!(
add_vecs(vec![vec![1, 0, 3], vec![11, 2, 3], vec![0, 0, 1]], 3),
vec![12, 2, 7]
)
}
#[test]
fn test_add_histograms_empty() {
assert_eq!(add_vecs(vec![], 3), vec![0, 0, 0])
}
#[test]
fn test_histogram_builder_simple() {
// [1..3)
// [3..5)
// ..
// [9..11)
let mut histogram_computer = HistogramComputer {
counts: vec![0; 5],
min_value: 1,
divider: DividerU64::divide_by(2),
};
histogram_computer.add_value(1);
histogram_computer.add_value(7);
assert_eq!(histogram_computer.harvest(), vec![1, 0, 0, 1, 0]);
}
#[test]
fn test_histogram_too_low_is_ignored() {
let mut histogram_computer = HistogramComputer {
counts: vec![0; 5],
min_value: 2,
divider: DividerU64::divide_by(2),
};
histogram_computer.add_value(0);
assert_eq!(histogram_computer.harvest(), vec![0, 0, 0, 0, 0]);
}
#[test]
fn test_histogram_too_high_is_ignored() {
let mut histogram_computer = HistogramComputer {
counts: vec![0u64; 5],
min_value: 0,
divider: DividerU64::divide_by(2),
};
histogram_computer.add_value(10);
assert_eq!(histogram_computer.harvest(), vec![0, 0, 0, 0, 0]);
}
#[test]
fn test_no_segments() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let val_field = schema_builder.add_u64_field("val_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let histogram_collector = HistogramCollector::new(val_field, 0u64, 2, 5);
let histogram = searcher.search(&all_query, &histogram_collector)?;
assert_eq!(histogram, vec![0; 5]);
Ok(())
}
#[test]
fn test_histogram_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let val_field = schema_builder.add_i64_field("val_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-10i64));
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let histogram_collector = HistogramCollector::new(val_field, -20i64, 10u64, 4);
let histogram = searcher.search(&all_query, &histogram_collector)?;
assert_eq!(histogram, vec![1, 1, 0, 1]);
Ok(())
}
#[test]
fn test_histogram_merge() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let val_field = schema_builder.add_i64_field("val_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.commit()?;
writer.add_document(doc!(val_field=>-30i64));
writer.commit()?;
writer.add_document(doc!(val_field=>-12i64));
writer.commit()?;
writer.add_document(doc!(val_field=>-10i64));
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let histogram_collector = HistogramCollector::new(val_field, -20i64, 10u64, 4);
let histogram = searcher.search(&all_query, &histogram_collector)?;
assert_eq!(histogram, vec![1, 1, 0, 1]);
Ok(())
}
#[test]
fn test_histogram_dates() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let week_histogram_collector = HistogramCollector::new(
date_field,
Utc.ymd(1980, 1, 1).and_hms(0, 0, 0),
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
10,
);
let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;
assert_eq!(week_histogram, vec![0, 0, 1, 1, 0, 0, 1, 0, 0, 0]);
Ok(())
}
}

View File

@@ -93,6 +93,9 @@ use downcast_rs::impl_downcast;
mod count_collector;
pub use self::count_collector::Count;
mod histogram_collector;
pub use histogram_collector::HistogramCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;