mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
Added documentation / test and change the contract of .add_facet()
This commit is contained in:
@@ -14,10 +14,8 @@ use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use postings::SkipResult;
|
||||
use std::{u64, usize};
|
||||
use schema::FACET_SEP_BYTE;
|
||||
use std::iter::Peekable;
|
||||
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -93,7 +91,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
|
||||
///
|
||||
/// Once collection is finished, you can harvest its results in the form
|
||||
/// of a `FacetCounts` object, and extract your facet counts from it.
|
||||
/// of a `FacetCounts` object, and extract your face t counts from it.
|
||||
///
|
||||
/// This implementation assumes you are working with a number of facets that
|
||||
/// is much hundreds of time lower than your number of documents.
|
||||
@@ -204,7 +202,6 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
field: Field,
|
||||
@@ -218,14 +215,14 @@ pub struct FacetCollector {
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
collapse: BTreeSet<Vec<u8>>,
|
||||
facets: BTreeSet<Facet>
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item=&'a Vec<u8>>>(target: &[u8], collapse_it: &mut Peekable<I>) -> SkipResult {
|
||||
fn skip<'a, I: Iterator<Item=&'a Facet>>(target: &[u8], collapse_it: &mut Peekable<I>) -> SkipResult {
|
||||
loop {
|
||||
match collapse_it.peek() {
|
||||
Some(facet_bytes) => {
|
||||
match facet_bytes[..].cmp(&target) {
|
||||
match facet_bytes.encoded_bytes().cmp(&target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
@@ -251,51 +248,49 @@ impl FacetCollector {
|
||||
/// This function does not check whether the field
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
let mut facet_collector = FacetCollector {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
field: field,
|
||||
ff_reader: None,
|
||||
segment_counters: Vec::new(),
|
||||
collapse: BTreeSet::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
};
|
||||
facet_collector.add_facet(Facet::from("/"));
|
||||
facet_collector
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Adds a facet that we want to record counts
|
||||
///
|
||||
/// Adding facet `Facet::from("/country")` for instance,
|
||||
/// will record the counts of all of the direct children of the facet country
|
||||
/// (e.g. `/country/FR`, `/country/UK`).
|
||||
///
|
||||
/// Adding two facets within which one is the prefix of the other is forbidden.
|
||||
/// If you need the correct number of unique documents for two such facets,
|
||||
/// just add them in separate `FacetCollector`.
|
||||
pub fn add_facet<T>(&mut self, facet_from: T)
|
||||
where Facet: From<T> {
|
||||
let facet = Facet::from(facet_from);
|
||||
let facet_bytes: &[u8] = facet.encoded_bytes();
|
||||
self.collapse.remove(&facet_bytes[..0]);
|
||||
for pos in facet_bytes.iter()
|
||||
.cloned()
|
||||
.position(|b| b == FACET_SEP_BYTE) {
|
||||
self.collapse.remove(&facet_bytes[..pos]);
|
||||
}
|
||||
self.collapse.insert(facet_bytes.to_owned());
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(
|
||||
SegmentFacetCounter {
|
||||
facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() },
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
}
|
||||
for old_facet in &self.facets {
|
||||
assert!(
|
||||
!old_facet.is_prefix_of(&facet),
|
||||
"Tried to add a facet which is a descendant of an already added facet.");
|
||||
assert!(
|
||||
!facet.is_prefix_of(&old_facet),
|
||||
"Tried to add a facet which is an ancestor of an already added facet."
|
||||
);
|
||||
}
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.collapse.iter().peekable();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader
|
||||
.facet_dict()
|
||||
@@ -338,6 +333,18 @@ impl FacetCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(
|
||||
SegmentFacetCounter {
|
||||
facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() },
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
@@ -392,7 +399,7 @@ impl FacetCollector {
|
||||
}
|
||||
}
|
||||
FacetCounts {
|
||||
facet_counts: facet_counts
|
||||
facet_counts
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -470,10 +477,7 @@ impl FacetCounts {
|
||||
let mut it = self.get(facet);
|
||||
|
||||
for (ref facet, count) in (&mut it).take(k) {
|
||||
heap.push(Hit {
|
||||
count: count,
|
||||
facet: facet
|
||||
});
|
||||
heap.push(Hit { count, facet });
|
||||
}
|
||||
|
||||
let mut lowest_count: u64 = heap.peek()
|
||||
@@ -483,10 +487,7 @@ impl FacetCounts {
|
||||
if count > lowest_count {
|
||||
lowest_count = count;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
*head = Hit {
|
||||
count: count,
|
||||
facet: facet
|
||||
};
|
||||
*head = Hit { count, facet };
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -508,6 +509,7 @@ mod tests {
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
#[test]
|
||||
@@ -561,6 +563,21 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected="Tried to add a facet which is a descendant of an already added facet.")]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/country/europe"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_non_used_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/countryeurope"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::fmt::{self, Display, Debug, Formatter};
|
||||
use std::str;
|
||||
use std::io::{self, Read, Write};
|
||||
use regex::Regex;
|
||||
use std::borrow::Borrow;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::borrow::Cow;
|
||||
use common::BinarySerializable;
|
||||
@@ -93,8 +94,26 @@ impl Facet {
|
||||
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
}
|
||||
|
||||
|
||||
/// Returns `true` iff other is a subfacet of `self`.
|
||||
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||
let self_bytes: &[u8] = self.encoded_bytes();
|
||||
let other_bytes: &[u8] = other.encoded_bytes();
|
||||
if self_bytes.len() < other_bytes.len() {
|
||||
if other_bytes.starts_with(self_bytes) {
|
||||
return other_bytes[self_bytes.len()] == 0u8;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl Borrow<[u8]> for Facet {
|
||||
fn borrow(&self) -> &[u8] {
|
||||
self.encoded_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user