From 272589a381986b229882ab850cc70cb53193dac2 Mon Sep 17 00:00:00 2001 From: Maciej Dziardziel Date: Thu, 6 Apr 2017 17:36:08 +0100 Subject: [PATCH 1/2] faceting for fast numerical fields --- src/collector/facet_collector.rs | 173 +++++++++++++++++++++++++++++++ src/collector/mod.rs | 3 + 2 files changed, 176 insertions(+) create mode 100644 src/collector/facet_collector.rs diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs new file mode 100644 index 000000000..e633ed80f --- /dev/null +++ b/src/collector/facet_collector.rs @@ -0,0 +1,173 @@ +use std::cmp::Eq; +use std::collections::HashMap; +use std::hash::Hash; + +use collector::Collector; +use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader}; +use schema::Field; + +use DocId; +use Result; +use Score; +use SegmentReader; +use SegmentLocalId; + + +/// faceting for i64/u64 fast field +pub struct FastFieldValueFacet + where T: FastFieldReader, + T::ValueType: Eq + Hash +{ + counters: HashMap, + field: Field, + ff_reader: Option, +} + + +impl FastFieldValueFacet + where T: FastFieldReader, + T::ValueType: Eq + Hash +{ + fn new(field: Field) -> FastFieldValueFacet { + FastFieldValueFacet { + counters: HashMap::new(), + field: field, + ff_reader: None, + } + } +} + + +impl Collector for FastFieldValueFacet + where T: FastFieldReader, + T::ValueType: Eq + Hash +{ + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { + self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field))); + Ok(()) + } + + fn collect(&mut self, doc: DocId, _: Score) { + let val = self.ff_reader.as_ref().unwrap().get(doc); + *(self.counters.entry(val).or_insert(0)) += 1; + } +} + + +enum FacedType { + FastFieldI64(FastFieldValueFacet), + FastFieldU64(FastFieldValueFacet), +} + + +pub struct FacetCollector { + facets: Vec, +} + + +impl FacetCollector { + fn new(facets: Vec) -> FacetCollector { + FacetCollector { facets: facets } + } +} + + +impl Collector for FacetCollector { + fn set_segment(&mut self, segment_id: SegmentLocalId, reader: &SegmentReader) -> Result<()> { + for facet_type in self.facets.iter_mut() { + match facet_type { + &mut FacedType::FastFieldI64(ref mut fast_field_value_facet) => { + fast_field_value_facet.set_segment(segment_id, reader) + } + &mut FacedType::FastFieldU64(ref mut fast_field_value_facet) => { + fast_field_value_facet.set_segment(segment_id, reader) + } + }; + } + Ok(()) + } + + fn collect(&mut self, doc: DocId, score: Score) { + for facet_type in self.facets.iter_mut() { + match facet_type { + &mut FacedType::FastFieldI64(ref mut fast_field_value_facet) => { + fast_field_value_facet.collect(doc, score) + } + &mut FacedType::FastFieldU64(ref mut fast_field_value_facet) => { + fast_field_value_facet.collect(doc, score) + } + } + } + } +} + + +#[cfg(test)] +mod tests { + + use super::*; + use collector::FacetCollector; + use query::QueryParser; + use schema::{self, Document}; + use Index; + + #[test] + // create 10 documents, set num field value to 0 or 1 for even/odd ones + // make sure we have facet counters correctly filled + fn test_facet_collector_results() { + let mut schema_builder = schema::SchemaBuilder::new(); + let num_field_i64 = + schema_builder.add_i64_field("num_i64", + schema::IntOptions::default().set_fast().set_indexed()); + let num_field_u64 = + schema_builder.add_u64_field("num_u64", + schema::IntOptions::default().set_fast().set_indexed()); + + let text_field = schema_builder.add_text_field( + "text", + schema::TextOptions::default() + .set_indexing_options(schema::TextIndexingOptions::Untokenized) + ); + + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + for i in 1..11 { + let mut doc = Document::default(); + doc.add_i64(num_field_i64, i % 2); + doc.add_u64(num_field_u64, (i % 2) as u64); + doc.add_text(text_field, "text"); + index_writer.add_document(doc); + } + } + assert_eq!(index_writer.commit().unwrap(), 10u64); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let ffvf_i64 = FastFieldValueFacet::new(num_field_i64); + let ffvf_u64 = FastFieldValueFacet::new(num_field_u64); + let mut facet_collector = FacetCollector::new(vec![FacedType::FastFieldI64(ffvf_i64), + FacedType::FastFieldU64(ffvf_u64)]); + + let query_parser = QueryParser::new(schema, vec![text_field]); + let query = query_parser.parse_query("text:text").unwrap(); + query.search(&searcher, &mut facet_collector).unwrap(); + for facet in facet_collector.facets { + match facet { + FacedType::FastFieldI64(ffvf) => { + assert_eq!(ffvf.counters[&0], 5); + assert_eq!(ffvf.counters[&1], 5); + } + FacedType::FastFieldU64(ffvf) => { + assert_eq!(ffvf.counters[&0], 5); + assert_eq!(ffvf.counters[&1], 5); + } + + } + } + } +} diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 72d5797ff..27435592d 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -13,6 +13,9 @@ pub use self::multi_collector::MultiCollector; mod top_collector; pub use self::top_collector::TopCollector; +mod facet_collector; +pub use self::facet_collector::FacetCollector; + mod chained_collector; pub use self::chained_collector::chain; From 02d992324a25d1380d0d862b9ebcf0786af903da Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 21 May 2017 22:56:43 +0900 Subject: [PATCH 2/2] simplified facets. --- src/collector/facet_collector.rs | 138 +++++++++---------------------- src/postings/mod.rs | 4 +- 2 files changed, 44 insertions(+), 98 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index e633ed80f..fbd613bac 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use std::hash::Hash; use collector::Collector; -use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader}; +use fastfield::FastFieldReader; use schema::Field; use DocId; @@ -13,8 +13,8 @@ use SegmentReader; use SegmentLocalId; -/// faceting for i64/u64 fast field -pub struct FastFieldValueFacet +/// Facet collector for i64/u64 fast field +pub struct FacetCollector where T: FastFieldReader, T::ValueType: Eq + Hash { @@ -24,12 +24,13 @@ pub struct FastFieldValueFacet } -impl FastFieldValueFacet +impl FacetCollector where T: FastFieldReader, T::ValueType: Eq + Hash { - fn new(field: Field) -> FastFieldValueFacet { - FastFieldValueFacet { + /// Creates a new facet collector for aggregating a given field. + pub fn new(field: Field) -> FacetCollector { + FacetCollector { counters: HashMap::new(), field: field, ff_reader: None, @@ -38,109 +39,57 @@ impl FastFieldValueFacet } -impl Collector for FastFieldValueFacet +impl Collector for FacetCollector where T: FastFieldReader, T::ValueType: Eq + Hash { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field))); + self.ff_reader = Some(reader.get_fast_field_reader(self.field)?); Ok(()) } fn collect(&mut self, doc: DocId, _: Score) { - let val = self.ff_reader.as_ref().unwrap().get(doc); + let val = self.ff_reader + .as_ref() + .expect("collect() was called before set_segment. This should never happen.") + .get(doc); *(self.counters.entry(val).or_insert(0)) += 1; } } -enum FacedType { - FastFieldI64(FastFieldValueFacet), - FastFieldU64(FastFieldValueFacet), -} - - -pub struct FacetCollector { - facets: Vec, -} - - -impl FacetCollector { - fn new(facets: Vec) -> FacetCollector { - FacetCollector { facets: facets } - } -} - - -impl Collector for FacetCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - for facet_type in self.facets.iter_mut() { - match facet_type { - &mut FacedType::FastFieldI64(ref mut fast_field_value_facet) => { - fast_field_value_facet.set_segment(segment_id, reader) - } - &mut FacedType::FastFieldU64(ref mut fast_field_value_facet) => { - fast_field_value_facet.set_segment(segment_id, reader) - } - }; - } - Ok(()) - } - - fn collect(&mut self, doc: DocId, score: Score) { - for facet_type in self.facets.iter_mut() { - match facet_type { - &mut FacedType::FastFieldI64(ref mut fast_field_value_facet) => { - fast_field_value_facet.collect(doc, score) - } - &mut FacedType::FastFieldU64(ref mut fast_field_value_facet) => { - fast_field_value_facet.collect(doc, score) - } - } - } - } -} - #[cfg(test)] mod tests { - use super::*; - use collector::FacetCollector; + use collector::{chain, FacetCollector}; use query::QueryParser; - use schema::{self, Document}; + use fastfield::{I64FastFieldReader, U64FastFieldReader}; + use schema::{self, Document, FieldValue, FAST, STRING}; use Index; #[test] // create 10 documents, set num field value to 0 or 1 for even/odd ones // make sure we have facet counters correctly filled fn test_facet_collector_results() { + let mut schema_builder = schema::SchemaBuilder::new(); - let num_field_i64 = - schema_builder.add_i64_field("num_i64", - schema::IntOptions::default().set_fast().set_indexed()); - let num_field_u64 = - schema_builder.add_u64_field("num_u64", - schema::IntOptions::default().set_fast().set_indexed()); - - let text_field = schema_builder.add_text_field( - "text", - schema::TextOptions::default() - .set_indexing_options(schema::TextIndexingOptions::Untokenized) - ); - + let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST); + let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST); + let text_field = schema_builder.add_text_field("text", STRING); let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { - for i in 1..11 { - let mut doc = Document::default(); - doc.add_i64(num_field_i64, i % 2); - doc.add_u64(num_field_u64, (i % 2) as u64); - doc.add_text(text_field, "text"); - index_writer.add_document(doc); + for i in 0u64..10u64 { + index_writer.add_document(doc!( + num_field_i64 => ((i as i64) % 3i64) as i64, + num_field_u64 => (i % 2u64) as u64, + text_field => "text" + )); } } assert_eq!(index_writer.commit().unwrap(), 10u64); @@ -148,26 +97,21 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); - let ffvf_i64 = FastFieldValueFacet::new(num_field_i64); - let ffvf_u64 = FastFieldValueFacet::new(num_field_u64); - let mut facet_collector = FacetCollector::new(vec![FacedType::FastFieldI64(ffvf_i64), - FacedType::FastFieldU64(ffvf_u64)]); + let mut ffvf_i64: FacetCollector = FacetCollector::new(num_field_i64); + let mut ffvf_u64: FacetCollector = FacetCollector::new(num_field_u64); - let query_parser = QueryParser::new(schema, vec![text_field]); - let query = query_parser.parse_query("text:text").unwrap(); - query.search(&searcher, &mut facet_collector).unwrap(); - for facet in facet_collector.facets { - match facet { - FacedType::FastFieldI64(ffvf) => { - assert_eq!(ffvf.counters[&0], 5); - assert_eq!(ffvf.counters[&1], 5); - } - FacedType::FastFieldU64(ffvf) => { - assert_eq!(ffvf.counters[&0], 5); - assert_eq!(ffvf.counters[&1], 5); - } - - } + { + // perform the query + let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64); + let query_parser = QueryParser::new(schema, vec![text_field]); + let query = query_parser.parse_query("text:text").unwrap(); + query.search(&searcher, &mut facet_collectors).unwrap(); } + + assert_eq!(ffvf_u64.counters[&0], 5); + assert_eq!(ffvf_u64.counters[&1], 5); + assert_eq!(ffvf_i64.counters[&0], 4); + assert_eq!(ffvf_i64.counters[&1], 3); + } } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index db2bf8e3c..a266f717f 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -522,7 +522,9 @@ mod tests { let segment_reader = searcher.segment_reader(0); b.iter(|| { let n: u32 = test::black_box(17); - let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); + let mut segment_postings = segment_reader + .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) + .unwrap(); let mut s = 0u32; while segment_postings.advance() { s += (segment_postings.doc() & n) % 1024;