Better intersection and added size_hint

This commit is contained in:
Paul Masurel
2017-05-22 22:16:09 +09:00
parent 0521844e56
commit 19c073385a
8 changed files with 91 additions and 25 deletions

View File

@@ -65,6 +65,10 @@ pub trait DocSet {
None
}
}
/// Returns a best-effort hint of the
/// length of the docset.
///
/// The value is an estimate only. `IntersectionDocSet` uses it as a
/// sort key to order its docsets, and an implementation with nothing
/// to report (e.g. `EmptyScorer`) returns `0`.
fn size_hint(&self) -> usize;
}
@@ -83,6 +87,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
fn size_hint(&self) -> usize {
let unboxed: &TDocSet = self.borrow();
unboxed.size_hint()
}
}
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
@@ -100,4 +109,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
let unref: &TDocSet = *self;
unref.doc()
}
fn size_hint(&self) -> usize {
let unref: &TDocSet = *self;
unref.size_hint()
}
}

View File

@@ -10,8 +10,9 @@ pub struct IntersectionDocSet<TDocSet: DocSet> {
}
impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
assert!(docsets.len() >= 2);
docsets.sort_by_key(|docset| docset.size_hint());
IntersectionDocSet {
docsets: docsets,
finished: false,
@@ -31,37 +32,55 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
fn size_hint(&self) -> usize {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.min()
.unwrap() // safe as docsets cannot be empty.
}
/// Moves the intersection forward to the next document and reports
/// whether one was found.
///
/// Returns `false` immediately once `self.finished` is set. On the paths
/// visible here, `self.finished` is set and `false` is returned when the
/// head docset cannot advance or a `skip_next` call yields
/// `SkipResult::End`. On success the candidate is stored in `self.doc`
/// and `true` is returned.
///
/// NOTE(review): this span shows two loop shapes interleaved, one driven
/// by `ord` / `count_matching` and one driven by `split_at_mut(1)` with a
/// head docset and a tail. It reads like the removed and added lines of
/// a diff rather than a single function body; confirm against the
/// checked-in file before relying on statement order here.
#[allow(never_loop)]
fn advance(&mut self) -> bool {
if self.finished {
return false;
}
let num_docsets = self.docsets.len();
let mut count_matching = 0;
let mut doc_candidate = 0;
let mut ord = 0;
loop {
let mut doc_set = &mut self.docsets[ord];
match doc_set.skip_next(doc_candidate) {
SkipResult::Reached => {
count_matching += 1;
if count_matching == num_docsets {
self.doc = doc_candidate;
return true;
let (head_arr, tail) = self.docsets.split_at_mut(1);
let head: &mut TDocSet = &mut head_arr[0];
if !head.advance() {
self.finished = true;
return false;
}
let mut doc_candidate = head.doc();
'outer: loop {
for docset in tail.iter_mut() {
match docset.skip_next(doc_candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
doc_candidate = docset.doc();
match head.skip_next(doc_candidate) {
SkipResult::Reached => {}
SkipResult::End => {
self.finished = true;
return false;
}
SkipResult::OverStep => {
doc_candidate = head.doc();
continue 'outer;
}
}
}
SkipResult::End => {
self.finished = true;
return false;
}
}
SkipResult::End => {
self.finished = true;
return false;
}
SkipResult::OverStep => {
count_matching = 1;
doc_candidate = doc_set.doc();
}
}
ord += 1;
if ord == num_docsets {
ord = 0;
}
self.doc = doc_candidate;
return true;
}
}

View File

@@ -152,6 +152,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
}
/// Size hint for these postings: whatever `self.len()` reports.
// NOTE(review): presumably `len()` is the number of docs in the
// postings list; its definition is not visible here, so confirm.
fn size_hint(&self) -> usize {
self.len()
}
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();

View File

@@ -34,6 +34,10 @@ impl DocSet for VecPostings {
fn doc(&self) -> DocId {
self.doc_ids[self.cursor.0]
}
/// Size hint for these postings: whatever `self.len()` reports.
// NOTE(review): `len()` presumably comes from the `HasLen` impl for
// `VecPostings`; its body is not visible here, so confirm.
fn size_hint(&self) -> usize {
self.len()
}
}
impl HasLen for VecPostings {

View File

@@ -93,6 +93,18 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
}
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
/// Returns a best-effort size hint for this boolean scorer.
///
/// Currently this is the largest hint among all sub-scorers, whatever
/// their occurrence kind. With no sub-scorers the hint is `0`, the same
/// value `EmptyScorer` reports, rather than a panic.
fn size_hint(&self) -> usize {
    // TODO fix this. it should be the min
    // of the MUST scorer
    // and the max of the SHOULD scorers.
    self.scorers
        .iter()
        .map(|scorer| scorer.size_hint())
        .max()
        // `max()` is `None` when `scorers` is empty; fall back to 0
        // instead of unwrapping.
        .unwrap_or(0)
}
fn advance(&mut self) -> bool {
loop {
self.score_combiner.clear();

View File

@@ -67,6 +67,10 @@ impl<'a> DocSet for PhraseScorer<'a> {
fn doc(&self) -> DocId {
self.intersection_docset.doc()
}
/// Size hint of the phrase scorer, taken directly from the size hint of
/// its `intersection_docset`.
fn size_hint(&self) -> usize {
self.intersection_docset.size_hint()
}
}

View File

@@ -49,6 +49,10 @@ impl DocSet for EmptyScorer {
fn doc(&self) -> DocId {
DocId::max_value()
}
/// The empty scorer always reports a size hint of `0`.
fn size_hint(&self) -> usize {
0
}
}
impl Scorer for EmptyScorer {

View File

@@ -32,6 +32,11 @@ impl<TPostings> DocSet for TermScorer<TPostings>
fn doc(&self) -> DocId {
self.postings.doc()
}
/// Size hint of the term scorer, taken directly from the size hint of
/// the underlying `postings`.
fn size_hint(&self) -> usize {
self.postings.size_hint()
}
}
impl<TPostings> Scorer for TermScorer<TPostings>