mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 21:20:40 +00:00
Query optimization: phrase query + union
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![feature(integer_atomics)]
|
||||
#![feature(drain_filter)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(iterator_step_by))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
@@ -12,6 +13,7 @@
|
||||
#![allow(new_without_default)]
|
||||
#![warn(missing_docs)]
|
||||
|
||||
|
||||
//! # `tantivy`
|
||||
//!
|
||||
//! Tantivy is a search engine library.
|
||||
|
||||
@@ -2,6 +2,7 @@ use postings::DocSet;
|
||||
use postings::SkipResult;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Creates a `DocSet` that iterates through the intersection of several `DocSet`s.
|
||||
pub struct IntersectionDocSet<TDocSet: DocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
|
||||
@@ -15,6 +15,7 @@ mod term_info;
|
||||
mod vec_postings;
|
||||
mod segment_postings;
|
||||
mod intersection;
|
||||
mod union;
|
||||
mod docset;
|
||||
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
@@ -30,6 +31,8 @@ pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::union::UnionDocSet;
|
||||
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = usize;
|
||||
|
||||
152
src/postings/union.rs
Normal file
152
src/postings/union.rs
Normal file
@@ -0,0 +1,152 @@
|
||||
use postings::DocSet;
|
||||
use postings::SkipResult;
|
||||
use common::TinySet;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Number of 64-bit words making up the buffer of a `UnionDocSet`.
const HORIZON_NUM_TINYBITSETS: usize = 1_024;
|
||||
/// Total number of bits held by the buffer: 64 bits for each of its words.
const HORIZON: usize = 64 * HORIZON_NUM_TINYBITSETS;
|
||||
|
||||
/// A `DocSet` that iterates through the union of several `DocSet`s.
///
/// Documents are buffered window by window into a bitset and then
/// handed out one at a time by `advance()`.
|
||||
pub struct UnionDocSet<TDocSet: DocSet> {
|
||||
// Underlying docsets that have not been entirely consumed yet.
docsets: Vec<TDocSet>,
|
||||
// Buffered documents: bit `i` of word `w` stands for the document
// `offset + 64 * w + i`.
bitsets: Box<[u64; HORIZON_NUM_TINYBITSETS]>,
|
||||
// Index of the word of `bitsets` currently being drained.
// `HORIZON_NUM_TINYBITSETS` means that the buffer is exhausted.
cursor: usize,
|
||||
// Document id that bit 0 of word 0 stands for.
offset: DocId,
|
||||
// Document most recently produced by `advance()`.
doc: DocId,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> From<Vec<TDocSet>> for UnionDocSet<TDocSet> {
|
||||
fn from(docsets: Vec<TDocSet>) -> UnionDocSet<TDocSet> {
|
||||
let non_empty_docsets: Vec<TDocSet> =
|
||||
docsets
|
||||
.into_iter()
|
||||
.flat_map(|mut docset| {
|
||||
if docset.advance() {
|
||||
Some(docset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
UnionDocSet {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([0u64; HORIZON_NUM_TINYBITSETS]),
|
||||
cursor: HORIZON_NUM_TINYBITSETS,
|
||||
offset: 0,
|
||||
doc: 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn refill<TDocSet: DocSet>(docsets: &mut Vec<TDocSet>, bitsets: &mut [u64; HORIZON_NUM_TINYBITSETS], min_doc: DocId) {
|
||||
docsets
|
||||
.drain_filter(|docset| {
|
||||
let horizon = min_doc + HORIZON_NUM_TINYBITSETS as u32;
|
||||
loop {
|
||||
let doc = docset.doc();
|
||||
if doc >= horizon {
|
||||
return false;
|
||||
}
|
||||
// add this document
|
||||
let delta = doc - min_doc;
|
||||
bitsets[(delta / 64) as usize] |= 1 << (delta % 64);
|
||||
if !docset.advance() {
|
||||
// remove the docset, it has been entirely consumed.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> UnionDocSet<TDocSet> {
|
||||
fn refill(&mut self) -> bool {
|
||||
if let Some(min_doc) = self.docsets
|
||||
.iter_mut()
|
||||
.map(|docset| docset.doc())
|
||||
.min() {
|
||||
self.offset = min_doc;
|
||||
self.cursor = 0;
|
||||
refill(&mut self.docsets, &mut *self.bitsets, min_doc);
|
||||
self.advance();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnionDocSet<TDocSet> {
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.cursor < HORIZON_NUM_TINYBITSETS {
|
||||
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
|
||||
self.doc = self.offset + val + (self.cursor as u32) * 64;
|
||||
return true;
|
||||
} else {
|
||||
self.cursor += 1;
|
||||
}
|
||||
}
|
||||
self.refill()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
0u32
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let mut reached = false;
|
||||
self.docsets
|
||||
.drain_filter(|docset| {
|
||||
match docset.skip_next(target) {
|
||||
SkipResult::End => true,
|
||||
SkipResult::Reached => {
|
||||
reached = true;
|
||||
false
|
||||
},
|
||||
SkipResult::OverStep => false
|
||||
}
|
||||
});
|
||||
if self.docsets.is_empty() {
|
||||
SkipResult::End
|
||||
} else {
|
||||
if reached {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::UnionDocSet;
    use postings::DocSet;
    use postings::VecPostings;

    /// The union yields each document once, in increasing order, including
    /// when lists overlap, when one of them is empty, and when documents are
    /// far apart.
    #[test]
    fn test_union() {
        let postings_lists = vec![
            VecPostings::from(vec![1, 3333, 100000000u32]),
            VecPostings::from(vec![1, 2, 100000000u32]),
            VecPostings::from(vec![1, 2, 100000000u32]),
            VecPostings::from(vec![]),
        ];
        let mut union = UnionDocSet::from(postings_lists);
        let mut collected_docs = vec![];
        while union.advance() {
            collected_docs.push(union.doc());
        }
        assert_eq!(&collected_docs, &[1u32, 2u32, 3333u32, 100000000u32]);
    }
}
|
||||
@@ -51,6 +51,9 @@ impl Query for BooleanQuery {
|
||||
|
||||
fn disable_scoring(&mut self) {
|
||||
self.scoring_disabled = true;
|
||||
for &mut (_, ref mut subquery) in &mut self.subqueries {
|
||||
subquery.disable_scoring();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use postings::{IntersectionDocSet, UnionDocSet};
|
||||
use std::collections::HashMap;
|
||||
use query::EmptyScorer;
|
||||
use query::Scorer;
|
||||
use super::BooleanScorer;
|
||||
use query::OccurFilter;
|
||||
use query::ConstScorer;
|
||||
use query::Occur;
|
||||
use Result;
|
||||
|
||||
@@ -33,19 +36,54 @@ impl Weight for BooleanWeight {
|
||||
weight.scorer(reader)
|
||||
}
|
||||
} else {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
|
||||
.iter()
|
||||
.map(|&(_, ref weight)| weight)
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect::<Result<_>>()?;
|
||||
let occurs: Vec<Occur> = self.weights
|
||||
.iter()
|
||||
.map(|&(ref occur, _)| *occur)
|
||||
.collect();
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
if self.scoring_disabled {
|
||||
let mut per_occur_scorers = HashMap::new();
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
per_occur_scorers
|
||||
.entry(occur)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(subweight.scorer(reader)?);
|
||||
}
|
||||
let mut result_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
.map(|subscorers| {
|
||||
assert!(!subscorers.is_empty());
|
||||
if subscorers.len() == 1 {
|
||||
subscorers
|
||||
.into_iter()
|
||||
.next()
|
||||
.unwrap() //< we checked the size beforehands
|
||||
} else {
|
||||
box ConstScorer::new(UnionDocSet::from(subscorers))
|
||||
}
|
||||
});
|
||||
if let Some(mut subscorers) = per_occur_scorers.remove(&Occur::Must) {
|
||||
if let Some(should_query) = result_scorer_opt {
|
||||
subscorers.push(should_query);
|
||||
}
|
||||
let intersection_docset = IntersectionDocSet::from(subscorers);
|
||||
result_scorer_opt = Some(box ConstScorer::new(intersection_docset));
|
||||
}
|
||||
|
||||
if let Some(result_scorer) = result_scorer_opt {
|
||||
Ok(result_scorer)
|
||||
} else {
|
||||
Ok(box EmptyScorer)
|
||||
}
|
||||
} else {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
|
||||
.iter()
|
||||
.map(|&(_, ref weight)| weight)
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect::<Result<_>>()?;
|
||||
let occurs: Vec<Occur> = self.weights
|
||||
.iter()
|
||||
.map(|&(ref occur, _)| *occur)
|
||||
.collect();
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/// Defines whether a term in a query must be present,
|
||||
/// should be present or must not be present.
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||
pub enum Occur {
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the document with the Should or the Must
|
||||
|
||||
@@ -128,6 +128,24 @@ impl DocSet for PhraseScorer {
|
||||
/// Returns the size hint of the underlying intersection docset, unchanged.
fn size_hint(&self) -> u32 {
|
||||
self.intersection_docset.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if self.intersection_docset.skip_next(target) == SkipResult::End {
|
||||
SkipResult::End
|
||||
} else if self.phrase_match() {
|
||||
if self.doc() == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
SkipResult::OverStep
|
||||
}
|
||||
} else {
|
||||
if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
SkipResult::End
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for PhraseScorer {
|
||||
|
||||
Reference in New Issue
Block a user