Query optimization: phrase query + union

This commit is contained in:
Paul Masurel
2018-02-02 16:39:17 +09:00
parent dd8332c327
commit fb5476d5de
8 changed files with 231 additions and 14 deletions

View File

@@ -5,6 +5,7 @@
#![feature(optin_builtin_traits)]
#![feature(conservative_impl_trait)]
#![feature(integer_atomics)]
#![feature(drain_filter)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(iterator_step_by))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
@@ -12,6 +13,7 @@
#![allow(new_without_default)]
#![warn(missing_docs)]
//! # `tantivy`
//!
//! Tantivy is a search engine library.

View File

@@ -2,6 +2,7 @@ use postings::DocSet;
use postings::SkipResult;
use DocId;
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
pub struct IntersectionDocSet<TDocSet: DocSet> {
docsets: Vec<TDocSet>,

View File

@@ -15,6 +15,7 @@ mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod union;
mod docset;
pub use self::docset::{DocSet, SkipResult};
@@ -30,6 +31,8 @@ pub use self::vec_postings::VecPostings;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::union::UnionDocSet;
pub use common::HasLen;
pub(crate) type UnorderedTermId = usize;

152
src/postings/union.rs Normal file
View File

@@ -0,0 +1,152 @@
use postings::DocSet;
use postings::SkipResult;
use common::TinySet;
use DocId;
/// Number of 64-bit words in the bitset buffer owned by `UnionDocSet`.
const HORIZON_NUM_TINYBITSETS: usize = 1_024;
/// Total number of bits (one per document id) that
/// `HORIZON_NUM_TINYBITSETS` words of 64 bits can hold.
const HORIZON: usize = 64 * HORIZON_NUM_TINYBITSETS;
/// Creates a `DocSet` that iterates through the union of several `DocSet`s.
///
/// Documents are staged in a bitset buffer, so a document id that is
/// present in several of the underlying `DocSet`s is only emitted once.
pub struct UnionDocSet<TDocSet: DocSet> {
// Underlying docsets that still have documents left to contribute.
docsets: Vec<TDocSet>,
// Staging buffer: bit `i % 64` of word `i / 64` stands for document `offset + i`.
bitsets: Box<[u64; HORIZON_NUM_TINYBITSETS]>,
// Index of the word of `bitsets` that `advance` is currently draining.
// A value of `HORIZON_NUM_TINYBITSETS` means the buffer is exhausted.
cursor: usize,
// Document id associated with bit 0 of `bitsets[0]`.
offset: DocId,
// Document id the union is currently positioned on.
doc: DocId,
}
impl<TDocSet: DocSet> From<Vec<TDocSet>> for UnionDocSet<TDocSet> {
    /// Builds the union of `docsets`.
    ///
    /// Every docset is advanced once up front, and the ones that turn out
    /// to be empty are discarded. The buffer starts out exhausted
    /// (`cursor == HORIZON_NUM_TINYBITSETS`), so the first call to
    /// `advance` goes straight to a refill.
    fn from(docsets: Vec<TDocSet>) -> UnionDocSet<TDocSet> {
        let mut live_docsets: Vec<TDocSet> = Vec::new();
        for mut candidate in docsets {
            if candidate.advance() {
                live_docsets.push(candidate);
            }
        }
        UnionDocSet {
            docsets: live_docsets,
            bitsets: Box::new([0u64; HORIZON_NUM_TINYBITSETS]),
            cursor: HORIZON_NUM_TINYBITSETS,
            offset: 0,
            doc: 0,
        }
    }
}
fn refill<TDocSet: DocSet>(docsets: &mut Vec<TDocSet>, bitsets: &mut [u64; HORIZON_NUM_TINYBITSETS], min_doc: DocId) {
docsets
.drain_filter(|docset| {
let horizon = min_doc + HORIZON_NUM_TINYBITSETS as u32;
loop {
let doc = docset.doc();
if doc >= horizon {
return false;
}
// add this document
let delta = doc - min_doc;
bitsets[(delta / 64) as usize] |= 1 << (delta % 64);
if !docset.advance() {
// remove the docset, it has been entirely consumed.
return true;
}
}
});
}
impl<TDocSet: DocSet> UnionDocSet<TDocSet> {
    /// Starts a new window at the smallest pending document, fills the
    /// buffer from the underlying docsets and positions the union on the
    /// first buffered document.
    ///
    /// Returns `false` when no docset is left, `true` otherwise.
    fn refill(&mut self) -> bool {
        let smallest_pending = self.docsets
            .iter()
            .map(|docset| docset.doc())
            .min();
        match smallest_pending {
            Some(min_doc) => {
                self.offset = min_doc;
                self.cursor = 0;
                refill(&mut self.docsets, &mut *self.bitsets, min_doc);
                self.advance();
                true
            }
            None => false,
        }
    }
}
impl<TDocSet: DocSet> DocSet for UnionDocSet<TDocSet> {
    /// Moves to the next document of the union.
    ///
    /// Buffered documents are consumed first; once the buffer is exhausted
    /// it is refilled from the underlying docsets. Returns `false` when the
    /// union has been entirely consumed.
    fn advance(&mut self) -> bool {
        while self.cursor < HORIZON_NUM_TINYBITSETS {
            if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
                self.doc = self.offset + val + (self.cursor as u32) * 64;
                return true;
            } else {
                self.cursor += 1;
            }
        }
        self.refill()
    }

    /// Returns the document the union is currently positioned on.
    fn doc(&self) -> DocId {
        self.doc
    }

    /// No estimate is computed for the union: this always returns 0.
    fn size_hint(&self) -> u32 {
        0u32
    }

    /// Moves to the first document greater than or equal to `target`
    /// that comes after the current position.
    ///
    /// Like `advance`, this always moves forward by at least one document.
    /// Returns `Reached` if the union lands exactly on `target`, `OverStep`
    /// if it lands beyond it, and `End` if the union is exhausted.
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        // Documents that were already moved into the buffer are no longer
        // reachable through the underlying docsets, so the buffer has to be
        // searched (and drained) first.
        while self.cursor < HORIZON_NUM_TINYBITSETS {
            if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
                let doc = self.offset + val + (self.cursor as u32) * 64;
                if doc >= target {
                    self.doc = doc;
                    return if doc == target {
                        SkipResult::Reached
                    } else {
                        SkipResult::OverStep
                    };
                }
            } else {
                self.cursor += 1;
            }
        }
        // The buffer is empty. Each remaining docset is positioned on a
        // document that has not been emitted yet: only the docsets that are
        // still behind `target` may be skipped, otherwise their current
        // document would be lost.
        self.docsets
            .drain_filter(|docset| {
                docset.doc() < target && docset.skip_next(target) == SkipResult::End
            });
        // `refill` positions the union on the smallest remaining document.
        if !self.refill() {
            return SkipResult::End;
        }
        if self.doc == target {
            SkipResult::Reached
        } else {
            SkipResult::OverStep
        }
    }
}
#[cfg(test)]
mod tests {

    use super::UnionDocSet;
    use postings::VecPostings;
    use postings::DocSet;

    /// The union must yield each document once, in increasing order,
    /// including across distant doc ids and with an empty input docset.
    #[test]
    fn test_union() {
        let postings_lists = vec![
            VecPostings::from(vec![1, 3333, 100000000u32]),
            VecPostings::from(vec![1, 2, 100000000u32]),
            VecPostings::from(vec![1, 2, 100000000u32]),
            VecPostings::from(vec![]),
        ];
        let mut union_docset = UnionDocSet::from(postings_lists);
        let mut collected_docs = Vec::new();
        while union_docset.advance() {
            collected_docs.push(union_docset.doc());
        }
        assert_eq!(&collected_docs, &[1u32, 2u32, 3333u32, 100000000u32]);
    }
}

View File

@@ -51,6 +51,9 @@ impl Query for BooleanQuery {
fn disable_scoring(&mut self) {
self.scoring_disabled = true;
for &mut (_, ref mut subquery) in &mut self.subqueries {
subquery.disable_scoring();
}
}
}

View File

@@ -1,9 +1,12 @@
use query::Weight;
use core::SegmentReader;
use postings::{IntersectionDocSet, UnionDocSet};
use std::collections::HashMap;
use query::EmptyScorer;
use query::Scorer;
use super::BooleanScorer;
use query::OccurFilter;
use query::ConstScorer;
use query::Occur;
use Result;
@@ -33,19 +36,54 @@ impl Weight for BooleanWeight {
weight.scorer(reader)
}
} else {
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
.iter()
.map(|&(_, ref weight)| weight)
.map(|weight| weight.scorer(reader))
.collect::<Result<_>>()?;
let occurs: Vec<Occur> = self.weights
.iter()
.map(|&(ref occur, _)| *occur)
.collect();
let occur_filter = OccurFilter::new(&occurs);
let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
Ok(box boolean_scorer)
}
if self.scoring_disabled {
let mut per_occur_scorers = HashMap::new();
for &(ref occur, ref subweight) in &self.weights {
per_occur_scorers
.entry(occur)
.or_insert_with(Vec::new)
.push(subweight.scorer(reader)?);
}
let mut result_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
.remove(&Occur::Should)
.map(|subscorers| {
assert!(!subscorers.is_empty());
if subscorers.len() == 1 {
subscorers
.into_iter()
.next()
.unwrap() //< we checked the size beforehands
} else {
box ConstScorer::new(UnionDocSet::from(subscorers))
}
});
if let Some(mut subscorers) = per_occur_scorers.remove(&Occur::Must) {
if let Some(should_query) = result_scorer_opt {
subscorers.push(should_query);
}
let intersection_docset = IntersectionDocSet::from(subscorers);
result_scorer_opt = Some(box ConstScorer::new(intersection_docset));
}
if let Some(result_scorer) = result_scorer_opt {
Ok(result_scorer)
} else {
Ok(box EmptyScorer)
}
} else {
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
.iter()
.map(|&(_, ref weight)| weight)
.map(|weight| weight.scorer(reader))
.collect::<Result<_>>()?;
let occurs: Vec<Occur> = self.weights
.iter()
.map(|&(ref occur, _)| *occur)
.collect();
let occur_filter = OccurFilter::new(&occurs);
let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
Ok(box boolean_scorer)
}
}
}
}

View File

@@ -1,6 +1,6 @@
/// Defines whether a term in a query must be present,
/// should be present or must not be present.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
pub enum Occur {
/// For a given document to be considered for scoring,
/// at least one of the document with the Should or the Must

View File

@@ -128,6 +128,24 @@ impl DocSet for PhraseScorer {
/// Returns the size hint of the underlying intersection docset.
fn size_hint(&self) -> u32 {
    let intersection = &self.intersection_docset;
    intersection.size_hint()
}
/// Skips to the first phrase match at or after `target`.
///
/// Returns `End` when no candidate is left, `Reached` when the phrase
/// matches exactly on `target`, and `OverStep` when the next phrase match
/// is a different document.
fn skip_next(&mut self, target: DocId) -> SkipResult {
    if self.intersection_docset.skip_next(target) == SkipResult::End {
        return SkipResult::End;
    }
    if !self.phrase_match() {
        // The candidate contains all the terms but not the phrase:
        // fall back to the next actual phrase match, if there is one.
        return if self.advance() {
            SkipResult::OverStep
        } else {
            SkipResult::End
        };
    }
    if self.doc() == target {
        SkipResult::Reached
    } else {
        SkipResult::OverStep
    }
}
}
impl Scorer for PhraseScorer {