diff --git a/src/lib.rs b/src/lib.rs index 46f537067..2746040a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ #![feature(optin_builtin_traits)] #![feature(conservative_impl_trait)] #![feature(integer_atomics)] +#![feature(drain_filter)] #![cfg_attr(test, feature(test))] #![cfg_attr(test, feature(iterator_step_by))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] @@ -12,6 +13,7 @@ #![allow(new_without_default)] #![warn(missing_docs)] + //! # `tantivy` //! //! Tantivy is a search engine library. diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index 5234f51c0..f1681b7b5 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -2,6 +2,7 @@ use postings::DocSet; use postings::SkipResult; use DocId; + /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s. pub struct IntersectionDocSet { docsets: Vec, diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 972d38aae..18a536181 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -15,6 +15,7 @@ mod term_info; mod vec_postings; mod segment_postings; mod intersection; +mod union; mod docset; pub use self::docset::{DocSet, SkipResult}; @@ -30,6 +31,8 @@ pub use self::vec_postings::VecPostings; pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings}; pub use self::intersection::IntersectionDocSet; +pub use self::union::UnionDocSet; + pub use common::HasLen; pub(crate) type UnorderedTermId = usize; diff --git a/src/postings/union.rs b/src/postings/union.rs new file mode 100644 index 000000000..5ca471c8f --- /dev/null +++ b/src/postings/union.rs @@ -0,0 +1,152 @@ +use postings::DocSet; +use postings::SkipResult; +use common::TinySet; +use DocId; + + +const HORIZON_NUM_TINYBITSETS: usize = 1_024; +const HORIZON: usize = 64 * HORIZON_NUM_TINYBITSETS; + +/// Creates a `DocSet` that iterates through the union of several `DocSet`s. 
+pub struct UnionDocSet { + docsets: Vec, + bitsets: Box<[u64; HORIZON_NUM_TINYBITSETS]>, + cursor: usize, + offset: DocId, + doc: DocId, +} + +impl From> for UnionDocSet { + fn from(docsets: Vec) -> UnionDocSet { + let non_empty_docsets: Vec = + docsets + .into_iter() + .flat_map(|mut docset| { + if docset.advance() { + Some(docset) + } else { + None + } + }) + .collect(); + UnionDocSet { + docsets: non_empty_docsets, + bitsets: Box::new([0u64; HORIZON_NUM_TINYBITSETS]), + cursor: HORIZON_NUM_TINYBITSETS, + offset: 0, + doc: 0 + } + } +} + +/// Records in `bitsets` every doc of `docsets` that lies less than `HORIZON` docs after `min_doc`, and removes the docsets that get exhausted on the way. +fn refill(docsets: &mut Vec, bitsets: &mut [u64; HORIZON_NUM_TINYBITSETS], min_doc: DocId) { + docsets + .drain_filter(|docset| { + let window = HORIZON as u32; + loop { + // The caller passes the smallest pending doc as `min_doc`, so this cannot underflow; comparing distances also avoids overflowing `DocId` the way an absolute upper bound could. + let delta = docset.doc() - min_doc; + if delta >= window { + return false; + } + // add this document + bitsets[(delta / 64) as usize] |= 1 << (delta % 64); + if !docset.advance() { + // remove the docset, it has been entirely consumed. 
+ return true; + } + } + }); +} +// Starts a new window at the smallest pending doc and positions on it; returns `false` once no docset is left. +impl UnionDocSet { + fn refill(&mut self) -> bool { + if let Some(min_doc) = self.docsets + .iter_mut() + .map(|docset| docset.doc()) + .min() { + self.offset = min_doc; + self.cursor = 0; + refill(&mut self.docsets, &mut *self.bitsets, min_doc); + self.advance(); + true + } else { + false + } + } +} + +impl DocSet for UnionDocSet { + /// Pops the next doc buffered in `bitsets`, refilling the buffer once it has been drained. + fn advance(&mut self) -> bool { + while self.cursor < HORIZON_NUM_TINYBITSETS { + if let Some(val) = self.bitsets[self.cursor].pop_lowest() { + self.doc = self.offset + val + (self.cursor as u32) * 64; + return true; + } else { + self.cursor += 1; + } + } + self.refill() + } + + fn doc(&self) -> DocId { + self.doc + } + + fn size_hint(&self) -> u32 { + 0u32 + } + + fn skip_next(&mut self, target: DocId) -> SkipResult { + // Skipping the underlying docsets directly is not correct here: + // docs may still be buffered in `bitsets`, the docsets have + // already been advanced past the current window, and + // `self.doc` would be left stale. + // + // Going through `advance` keeps the buffer, the cursor and + // `self.doc` consistent, at the price of a linear scan. + // + // NOTE(review): this always advances at least once before + // comparing against `target`; confirm that this matches the + // contract expected from `DocSet::skip_next`. + while self.advance() { + if self.doc == target { + return SkipResult::Reached; + } + if self.doc > target { + return SkipResult::OverStep; + } + } + // every docset is exhausted + SkipResult::End + } +} + + +#[cfg(test)] +mod tests { + + use super::UnionDocSet; + use postings::VecPostings; + use postings::DocSet; + + #[test] + fn test_union() { + let mut union = UnionDocSet::from( + vec!( + VecPostings::from(vec![1, 3333, 100000000u32]), + VecPostings::from(vec![1,2, 100000000u32]), + VecPostings::from(vec![1,2, 100000000u32]), + VecPostings::from(vec![]) + ) + ); + let mut docsets = vec![]; + while union.advance() { + docsets.push(union.doc()); + } + assert_eq!(&docsets, &[1u32, 2u32, 3333u32, 100000000u32]); + } + +} \ No newline at end of file diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 3cd3290b5..a61c55437 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -51,6 +51,9 @@ impl Query for BooleanQuery 
{ fn disable_scoring(&mut self) { self.scoring_disabled = true; + for &mut (_, ref mut subquery) in &mut self.subqueries { + subquery.disable_scoring(); + } } } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 148eb65a5..9bf35ab51 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,9 +1,12 @@ use query::Weight; use core::SegmentReader; +use postings::{IntersectionDocSet, UnionDocSet}; +use std::collections::HashMap; use query::EmptyScorer; use query::Scorer; use super::BooleanScorer; use query::OccurFilter; +use query::ConstScorer; use query::Occur; use Result; @@ -33,19 +36,62 @@ impl Weight for BooleanWeight { weight.scorer(reader) } } else { - let sub_scorers: Vec> = self.weights - .iter() - .map(|&(_, ref weight)| weight) - .map(|weight| weight.scorer(reader)) - .collect::>()?; - let occurs: Vec = self.weights - .iter() - .map(|&(ref occur, _)| *occur) - .collect(); - let occur_filter = OccurFilter::new(&occurs); - let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter); - Ok(box boolean_scorer) - } + // The set-based fast path below has no way to exclude documents, so + // queries holding a `MustNot` clause go through the regular `BooleanScorer`. + if self.scoring_disabled && self.weights.iter().all(|&(ref occur, _)| *occur != Occur::MustNot) { + let mut per_occur_scorers = HashMap::new(); + for &(ref occur, ref subweight) in &self.weights { + per_occur_scorers + .entry(occur) + .or_insert_with(Vec::new) + .push(subweight.scorer(reader)?); + } + let mut result_scorer_opt: Option> = per_occur_scorers + .remove(&Occur::Should) + .map(|subscorers| { + assert!(!subscorers.is_empty()); + if subscorers.len() == 1 { + subscorers + .into_iter() + .next() + .unwrap() //< we checked the size beforehands + } else { + box ConstScorer::new(UnionDocSet::from(subscorers)) + } + }); 
+ if let Some(subscorers) = per_occur_scorers.remove(&Occur::Must) { + // With `Must` clauses present, `Should` clauses are optional and + // only matter for scoring, so their union is discarded here + // instead of being intersected with the `Must` clauses. + result_scorer_opt = Some(if subscorers.len() == 1 { + subscorers + .into_iter() + .next() + .unwrap() //< exactly one scorer, checked just above + } else { + box ConstScorer::new(IntersectionDocSet::from(subscorers)) + }); + } + if let Some(result_scorer) = result_scorer_opt { + Ok(result_scorer) + } else { + Ok(box EmptyScorer) + } + } else { + let sub_scorers: Vec> = self.weights + .iter() + .map(|&(_, ref weight)| weight) + .map(|weight| weight.scorer(reader)) + .collect::>()?; + let occurs: Vec = self.weights + .iter() + .map(|&(ref occur, _)| *occur) + .collect(); + let occur_filter = OccurFilter::new(&occurs); + let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter); + Ok(box boolean_scorer) + } + } } } diff --git a/src/query/occur.rs b/src/query/occur.rs index 7d0ee2f02..9bcf02bc2 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -1,6 +1,6 @@ /// Defines whether a term in a query must be present, /// should be present or must not be present. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)] pub enum Occur { /// For a given document to be considered for scoring, /// at least one of the document with the Should or the Must diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index ece4994d9..033409b8c 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -128,6 +128,27 @@ impl DocSet for PhraseScorer { fn size_hint(&self) -> u32 { self.intersection_docset.size_hint() } + + /// Skips `intersection_docset` to `target`; when the phrase does not match on + /// the document reached, falls back on `advance`. Returns `Reached` only if + /// the phrase matches on `target` itself, and `End` once nothing is left. + fn skip_next(&mut self, target: DocId) -> SkipResult { + if self.intersection_docset.skip_next(target) == SkipResult::End { + SkipResult::End + } else if self.phrase_match() { + if self.doc() == target { + SkipResult::Reached + } else { + SkipResult::OverStep + } + } else { + if self.advance() { + SkipResult::OverStep + } else { + SkipResult::End + } + } + } } impl Scorer for PhraseScorer {