diff --git a/src/postings/docset.rs b/src/postings/docset.rs index e28319f42..ea4211a5f 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -65,6 +65,10 @@ pub trait DocSet { None } } + + /// Returns a best-effort hint of the + /// length of the docset. + fn size_hint(&self) -> usize; } @@ -83,6 +87,11 @@ impl DocSet for Box { let unboxed: &TDocSet = self.borrow(); unboxed.doc() } + + fn size_hint(&self) -> usize { + let unboxed: &TDocSet = self.borrow(); + unboxed.size_hint() + } } impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { @@ -100,4 +109,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { let unref: &TDocSet = *self; unref.doc() } + + fn size_hint(&self) -> usize { + let unref: &TDocSet = *self; + unref.size_hint() + } } diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index e4e4c2308..3f30a54c7 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -10,8 +10,9 @@ pub struct IntersectionDocSet { } impl From> for IntersectionDocSet { - fn from(docsets: Vec) -> IntersectionDocSet { + fn from(mut docsets: Vec) -> IntersectionDocSet { assert!(docsets.len() >= 2); + docsets.sort_by_key(|docset| docset.size_hint()); IntersectionDocSet { docsets: docsets, finished: false, @@ -31,37 +32,55 @@ impl IntersectionDocSet { impl DocSet for IntersectionDocSet { + fn size_hint(&self) -> usize { + self.docsets + .iter() + .map(|docset| docset.size_hint()) + .min() + .unwrap() // safe as docsets cannot be empty. + } + + #[allow(never_loop)] fn advance(&mut self) -> bool { if self.finished { return false; } - let num_docsets = self.docsets.len(); - let mut count_matching = 0; - let mut doc_candidate = 0; - let mut ord = 0; - loop { - let mut doc_set = &mut self.docsets[ord]; - match doc_set.skip_next(doc_candidate) { - SkipResult::Reached => { - count_matching += 1; - if count_matching == num_docsets { - self.doc = doc_candidate; - return true; + let (head_arr, tail) = self.docsets.split_at_mut(1); + let head: &mut TDocSet = &mut head_arr[0]; + if !head.advance() { + self.finished = true; + return false; + } + let mut doc_candidate = head.doc(); + + 'outer: loop { + + for docset in tail.iter_mut() { + match docset.skip_next(doc_candidate) { + SkipResult::Reached => {} + SkipResult::OverStep => { + doc_candidate = docset.doc(); + match head.skip_next(doc_candidate) { + SkipResult::Reached => {} + SkipResult::End => { + self.finished = true; + return false; + } + SkipResult::OverStep => { + doc_candidate = head.doc(); + continue 'outer; + } + } + } + SkipResult::End => { + self.finished = true; + return false; } } - SkipResult::End => { - self.finished = true; - return false; - } - SkipResult::OverStep => { - count_matching = 1; - doc_candidate = doc_set.doc(); - } - } - ord += 1; - if ord == num_docsets { - ord = 0; } + + self.doc = doc_candidate; + return true; } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d6386d138..f42922629 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -152,6 +152,10 @@ impl<'a> DocSet for SegmentPostings<'a> { } } + fn size_hint(&self) -> usize { + self.len() + } + #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index 399307cff..8c9512fb1 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -34,6 +34,10 @@ impl DocSet for VecPostings { fn doc(&self) -> DocId { self.doc_ids[self.cursor.0] } + + fn size_hint(&self) -> usize { + self.len() + } } impl HasLen for VecPostings { diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 8e1bf5950..595f54219 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -93,6 +93,18 @@ impl BooleanScorer { } impl DocSet for BooleanScorer { + fn size_hint(&self) -> usize { + // TODO fix this. it should be the min + // of the MUST scorer + // and the max of the SHOULD scorers. + self.scorers + .iter() + .map(|scorer| scorer.size_hint()) + .max() + .unwrap() + } + + fn advance(&mut self) -> bool { loop { self.score_combiner.clear(); diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 23721037b..1726340d1 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -67,6 +67,10 @@ impl<'a> DocSet for PhraseScorer<'a> { fn doc(&self) -> DocId { self.intersection_docset.doc() } + + fn size_hint(&self) -> usize { + self.intersection_docset.size_hint() + } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 027af82de..e3f677edf 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -49,6 +49,10 @@ impl DocSet for EmptyScorer { fn doc(&self) -> DocId { DocId::max_value() } + + fn size_hint(&self) -> usize { + 0 + } } impl Scorer for EmptyScorer { diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 0819aeb58..73ea46b4b 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -32,6 +32,11 @@ impl DocSet for TermScorer fn doc(&self) -> DocId { self.postings.doc() } + + + fn size_hint(&self) -> usize { + self.postings.size_hint() + } } impl Scorer for TermScorer