From dd028841e88e245928be5201ec12f7efbc3acd9a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 3 Feb 2018 00:14:54 +0900 Subject: [PATCH] Added documentation / test and change the contract of .add_facet() --- src/collector/facet_collector.rs | 101 ++++++++++++++++++------------- src/schema/facet.rs | 19 ++++++ 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 1fe4210f0..26e14d14f 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -14,10 +14,8 @@ use std::collections::BTreeSet; use termdict::TermMerger; use postings::SkipResult; use std::{u64, usize}; -use schema::FACET_SEP_BYTE; use std::iter::Peekable; - use DocId; use Result; use Score; @@ -93,7 +91,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`). /// /// Once collection is finished, you can harvest its results in the form -/// of a `FacetCounts` object, and extract your facet counts from it. +/// of a `FacetCounts` object, and extract your face t counts from it. /// /// This implementation assumes you are working with a number of facets that /// is much hundreds of time lower than your number of documents. @@ -204,7 +202,6 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// Ok(()) /// } /// ``` - pub struct FacetCollector { facet_ords: Vec, field: Field, @@ -218,14 +215,14 @@ pub struct FacetCollector { // collapse facet_id -> facet_ord current_collapse_facet_ords: Vec, - collapse: BTreeSet>, + facets: BTreeSet } -fn skip<'a, I: Iterator>>(target: &[u8], collapse_it: &mut Peekable) -> SkipResult { +fn skip<'a, I: Iterator>(target: &[u8], collapse_it: &mut Peekable) -> SkipResult { loop { match collapse_it.peek() { Some(facet_bytes) => { - match facet_bytes[..].cmp(&target) { + match facet_bytes.encoded_bytes().cmp(&target) { Ordering::Less => {} Ordering::Greater => { return SkipResult::OverStep; @@ -251,51 +248,49 @@ impl FacetCollector { /// This function does not check whether the field /// is of the proper type. pub fn for_field(field: Field) -> FacetCollector { - let mut facet_collector = FacetCollector { + FacetCollector { facet_ords: Vec::with_capacity(255), - field: field, - ff_reader: None, segment_counters: Vec::new(), - collapse: BTreeSet::new(), + field, + ff_reader: None, + facets: BTreeSet::new(), current_segment_collapse_mapping: Vec::new(), current_collapse_facet_ords: Vec::new(), current_segment_counts: Vec::new(), - }; - facet_collector.add_facet(Facet::from("/")); - facet_collector + } } + + /// Adds a facet that we want to record counts + /// + /// Adding facet `Facet::from("/country")` for instance, + /// will record the counts of all of the direct children of the facet country + /// (e.g. `/country/FR`, `/country/UK`). + /// + /// Adding two facets within which one is the prefix of the other is forbidden. + /// If you need the correct number of unique documents for two such facets, + /// just add them in separate `FacetCollector`. pub fn add_facet(&mut self, facet_from: T) where Facet: From { let facet = Facet::from(facet_from); - let facet_bytes: &[u8] = facet.encoded_bytes(); - self.collapse.remove(&facet_bytes[..0]); - for pos in facet_bytes.iter() - .cloned() - .position(|b| b == FACET_SEP_BYTE) { - self.collapse.remove(&facet_bytes[..pos]); - } - self.collapse.insert(facet_bytes.to_owned()); - } - - fn finalize_segment(&mut self) { - if self.ff_reader.is_some() { - self.segment_counters.push( - SegmentFacetCounter { - facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, - facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), - facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), - } + for old_facet in &self.facets { + assert!( + !old_facet.is_prefix_of(&facet), + "Tried to add a facet which is a descendant of an already added facet."); + assert!( + !facet.is_prefix_of(&old_facet), + "Tried to add a facet which is an ancestor of an already added facet." ); } + self.facets.insert(facet); } fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) { self.current_segment_collapse_mapping.clear(); self.current_collapse_facet_ords.clear(); self.current_segment_counts.clear(); - let mut collapse_facet_it = self.collapse.iter().peekable(); + let mut collapse_facet_it = self.facets.iter().peekable(); self.current_collapse_facet_ords.push(0); let mut facet_streamer = facet_reader .facet_dict() @@ -338,6 +333,18 @@ impl FacetCollector { } } + fn finalize_segment(&mut self) { + if self.ff_reader.is_some() { + self.segment_counters.push( + SegmentFacetCounter { + facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, + facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), + facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), + } + ); + } + } + /// Returns the results of the collection. /// /// This method does not just return the counters, @@ -392,7 +399,7 @@ impl FacetCollector { } } FacetCounts { - facet_counts: facet_counts + facet_counts } } } @@ -470,10 +477,7 @@ impl FacetCounts { let mut it = self.get(facet); for (ref facet, count) in (&mut it).take(k) { - heap.push(Hit { - count: count, - facet: facet - }); + heap.push(Hit { count, facet }); } let mut lowest_count: u64 = heap.peek() @@ -483,10 +487,7 @@ impl FacetCounts { if count > lowest_count { lowest_count = count; if let Some(mut head) = heap.peek_mut() { - *head = Hit { - count: count, - facet: facet - }; + *head = Hit { count, facet }; } } } @@ -508,6 +509,7 @@ mod tests { use query::AllQuery; use super::{FacetCollector, FacetCounts}; use std::iter; + use schema::Field; use rand::{thread_rng, Rng}; #[test] @@ -561,6 +563,21 @@ mod tests { } } + #[test] + #[should_panic(expected="Tried to add a facet which is a descendant of an already added facet.")] + fn test_misused_facet_collector() { + let mut facet_collector = FacetCollector::for_field(Field(0)); + facet_collector.add_facet(Facet::from("/country")); + facet_collector.add_facet(Facet::from("/country/europe")); + } + + #[test] + fn test_non_used_facet_collector() { + let mut facet_collector = FacetCollector::for_field(Field(0)); + facet_collector.add_facet(Facet::from("/country")); + facet_collector.add_facet(Facet::from("/countryeurope")); + } + #[test] fn test_facet_collector_topk() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/schema/facet.rs b/src/schema/facet.rs index d8f61eda9..9c67fb8bb 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display, Debug, Formatter}; use std::str; use std::io::{self, Read, Write}; use regex::Regex; +use std::borrow::Borrow; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::Cow; use common::BinarySerializable; @@ -93,8 +94,26 @@ impl Facet { pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec { &mut self.0 } + + + /// Returns `true` iff other is a subfacet of `self`. + pub fn is_prefix_of(&self, other: &Facet) -> bool { + let self_bytes: &[u8] = self.encoded_bytes(); + let other_bytes: &[u8] = other.encoded_bytes(); + if self_bytes.len() < other_bytes.len() { + if other_bytes.starts_with(self_bytes) { + return other_bytes[self_bytes.len()] == 0u8; + } + } + false + } } +impl Borrow<[u8]> for Facet { + fn borrow(&self) -> &[u8] { + self.encoded_bytes() + } +} impl<'a, T: ?Sized + AsRef> From<&'a T> for Facet {