Query optimization: phrase query + union

2026-05-26 21:20:40 +00:00 · 2018-02-02 16:39:17 +09:00
parent dd8332c327
commit fb5476d5de
8 changed files with 231 additions and 14 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,6 +5,7 @@
 #![feature(optin_builtin_traits)]
 #![feature(conservative_impl_trait)]
 #![feature(integer_atomics)]
+#![feature(drain_filter)]
 #![cfg_attr(test, feature(test))]
 #![cfg_attr(test, feature(iterator_step_by))]
 #![doc(test(attr(allow(unused_variables), deny(warnings))))]
@@ -12,6 +13,7 @@
 #![allow(new_without_default)]
 #![warn(missing_docs)]

+
 //! # `tantivy`
 //!
 //! Tantivy is a search engine library.
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -2,6 +2,7 @@ use postings::DocSet;
 use postings::SkipResult;
 use DocId;

+
 /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
 pub struct IntersectionDocSet<TDocSet: DocSet> {
    docsets: Vec<TDocSet>,
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -15,6 +15,7 @@ mod term_info;
 mod vec_postings;
 mod segment_postings;
 mod intersection;
+mod union;
 mod docset;

 pub use self::docset::{DocSet, SkipResult};
@@ -30,6 +31,8 @@ pub use self::vec_postings::VecPostings;

 pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
 pub use self::intersection::IntersectionDocSet;
+pub use self::union::UnionDocSet;
+
 pub use common::HasLen;

 pub(crate) type UnorderedTermId = usize;
--- a/src/postings/union.rs
+++ b/src/postings/union.rs
@@ -0,0 +1,152 @@
+use postings::DocSet;
+use postings::SkipResult;
+use common::TinySet;
+use DocId;
+
+
+const HORIZON_NUM_TINYBITSETS: usize = 1_024; // number of 64-bit words in the buffered window
+const HORIZON: usize = 64 * HORIZON_NUM_TINYBITSETS; // number of bits (one per doc id) in those words
+
+/// Creates a `DocSet` that iterates through the union of several `DocSet`s.
+pub struct UnionDocSet<TDocSet: DocSet> {
+    docsets: Vec<TDocSet>, // underlying docsets that have not been fully consumed yet
+    bitsets: Box<[u64; HORIZON_NUM_TINYBITSETS]>, // buffered window: bit `i` set means doc `offset + i` is pending
+    cursor: usize, // index of the `bitsets` word currently being drained
+    offset: DocId, // doc id represented by bit 0 of the window
+    doc: DocId, // doc id most recently produced by `advance()`
+}
+
+impl<TDocSet: DocSet> From<Vec<TDocSet>> for UnionDocSet<TDocSet> {
+    /// Builds the union of `docsets`. Each docset is advanced once up front
+    /// and dropped when that first `advance()` fails, so that every docset
+    /// that is kept is positioned on a doc.
+    fn from(docsets: Vec<TDocSet>) -> UnionDocSet<TDocSet> {
+        let mut non_empty_docsets: Vec<TDocSet> = Vec::with_capacity(docsets.len());
+        for mut docset in docsets {
+            if docset.advance() {
+                non_empty_docsets.push(docset);
+            }
+        }
+        UnionDocSet {
+            docsets: non_empty_docsets,
+            // Nothing is buffered yet: an all-zero window with the cursor past
+            // its last word makes the first `advance()` go straight to a refill.
+            bitsets: Box::new([0u64; HORIZON_NUM_TINYBITSETS]),
+            cursor: HORIZON_NUM_TINYBITSETS,
+            offset: 0,
+            doc: 0,
+        }
+    }
+}
+
+
+/// Buffers into `bitsets` every doc of `docsets` that lies in `[min_doc, min_doc + HORIZON)`; consumed docsets are removed.
+fn refill<TDocSet: DocSet>(docsets: &mut Vec<TDocSet>, bitsets: &mut [u64; HORIZON_NUM_TINYBITSETS], min_doc: DocId) {
+    // The window covers every bit of `bitsets` (`HORIZON` docs), not one doc per word.
+    // The bound is computed in u64 so that it cannot overflow at the top of the doc id range.
+    let horizon = u64::from(min_doc) + HORIZON as u64;
+    docsets.drain_filter(|docset| {
+        while u64::from(docset.doc()) < horizon {
+            // add this document: bit `delta` of the window stands for `min_doc + delta`.
+            let delta = docset.doc() - min_doc;
+            bitsets[(delta / 64) as usize] |= 1u64 << (delta % 64);
+            if !docset.advance() {
+                // remove the docset, it has been entirely consumed.
+                return true;
+            }
+        }
+        // keep the docset: its current doc lies beyond this window.
+        false
+    });
+}
+
+impl<TDocSet: DocSet> UnionDocSet<TDocSet> {
+    /// Buffers the next window of docs; returns `false` once no docset is left.
+    fn refill(&mut self) -> bool {
+        let min_doc = match self.docsets.iter().map(|docset| docset.doc()).min() {
+            Some(min_doc) => min_doc,
+            None => return false,
+        };
+        // The new window starts at the smallest doc the docsets are positioned on.
+        self.offset = min_doc;
+        self.cursor = 0;
+        refill(&mut self.docsets, &mut *self.bitsets, min_doc);
+        // Position `self` on the first buffered doc (`min_doc` itself).
+        self.advance();
+        true
+    }
+}
+
+impl<TDocSet: DocSet> DocSet for UnionDocSet<TDocSet> {
+
+    /// Moves to the next doc of the union.
+    ///
+    /// Pending docs are popped from the buffered window first; once the
+    /// window is drained, a new one is buffered from the underlying docsets.
+    /// Returns `false` when nothing is buffered and no docset is left.
+    fn advance(&mut self) -> bool {
+        while self.cursor < HORIZON_NUM_TINYBITSETS {
+            if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
+                // `val` is the bit index within the current word, and
+                // `cursor * 64` the position of that word within the window.
+                self.doc = self.offset + val + (self.cursor as u32) * 64;
+                return true;
+            } else {
+                self.cursor += 1;
+            }
+        }
+        self.refill()
+    }
+
+    /// Returns the doc the union was last positioned on by `advance()`.
+    fn doc(&self) -> DocId {
+        self.doc
+    }
+
+    /// No estimate is computed for the union: this always returns 0.
+    fn size_hint(&self) -> u32 {
+        0u32
+    }
+
+    /// Advances to the first doc after the current one that is `>= target`.
+    ///
+    /// This goes through `advance()` instead of skipping the underlying
+    /// docsets: docs already buffered in the window are no longer held by
+    /// them, and `doc`/`cursor`/`bitsets` must match the reported position.
+    /// NOTE(review): always moves at least once — confirm against the trait.
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        while self.advance() {
+            if self.doc >= target {
+                return if self.doc == target { SkipResult::Reached } else { SkipResult::OverStep };
+            }
+        }
+        SkipResult::End
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+
+    use super::UnionDocSet;
+    use postings::VecPostings;
+    use postings::DocSet;
+
+    /// Overlapping, far-apart and empty postings merge into one deduplicated, sorted stream.
+    #[test]
+    fn test_union() {
+        let postings = vec![
+            VecPostings::from(vec![1, 3333, 100000000u32]),
+            VecPostings::from(vec![1, 2, 100000000u32]),
+            VecPostings::from(vec![1, 2, 100000000u32]),
+            VecPostings::from(vec![]),
+        ];
+        let mut union = UnionDocSet::from(postings);
+        let mut docs = vec![];
+        while union.advance() {
+            docs.push(union.doc());
+        }
+        assert_eq!(&docs, &[1u32, 2u32, 3333u32, 100000000u32]);
+    }
+
+}
--- a/src/query/boolean_query/boolean_query.rs
+++ b/src/query/boolean_query/boolean_query.rs
@@ -51,6 +51,9 @@ impl Query for BooleanQuery {

    fn disable_scoring(&mut self) {
        self.scoring_disabled = true;
+        for &mut (_, ref mut subquery) in &mut self.subqueries { // propagate the flag to every sub-query as well
+            subquery.disable_scoring();
+        }
    }
 }

--- a/src/query/boolean_query/boolean_weight.rs
+++ b/src/query/boolean_query/boolean_weight.rs
@@ -1,9 +1,12 @@
 use query::Weight;
 use core::SegmentReader;
+use postings::{IntersectionDocSet, UnionDocSet};
+use std::collections::HashMap;
 use query::EmptyScorer;
 use query::Scorer;
 use super::BooleanScorer;
 use query::OccurFilter;
+use query::ConstScorer;
 use query::Occur;
 use Result;

@@ -33,19 +36,54 @@ impl Weight for BooleanWeight {
                weight.scorer(reader)
            }
        } else {
-            let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
-                .iter()
-                .map(|&(_, ref weight)| weight)
-                .map(|weight| weight.scorer(reader))
-                .collect::<Result<_>>()?;
-            let occurs: Vec<Occur> = self.weights
-                .iter()
-                .map(|&(ref occur, _)| *occur)
-                .collect();
-            let occur_filter = OccurFilter::new(&occurs);
-            let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
-            Ok(box boolean_scorer)
-        }
+            if self.scoring_disabled {
+                let mut per_occur_scorers = HashMap::new();
+                for &(ref occur, ref subweight) in &self.weights {
+                    per_occur_scorers
+                        .entry(occur)
+                        .or_insert_with(Vec::new)
+                        .push(subweight.scorer(reader)?);
+                }
+                let mut result_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
+                    .remove(&Occur::Should)
+                    .map(|subscorers| {
+                        assert!(!subscorers.is_empty());
+                        if subscorers.len() == 1 {
+                            subscorers
+                                .into_iter()
+                                .next()
+                                .unwrap() //< we checked the size beforehands
+                        } else {
+                            box ConstScorer::new(UnionDocSet::from(subscorers))
+                        }
+                    });
+                if let Some(mut subscorers) = per_occur_scorers.remove(&Occur::Must) {
+                    if let Some(should_query) = result_scorer_opt {
+                        subscorers.push(should_query);
+                    }
+                    let intersection_docset = IntersectionDocSet::from(subscorers);
+                    result_scorer_opt = Some(box ConstScorer::new(intersection_docset));
+                }

+                if let Some(result_scorer) = result_scorer_opt {
+                    Ok(result_scorer)
+                } else {
+                    Ok(box EmptyScorer)
+                }
+            } else {
+                let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
+                    .iter()
+                    .map(|&(_, ref weight)| weight)
+                    .map(|weight| weight.scorer(reader))
+                    .collect::<Result<_>>()?;
+                let occurs: Vec<Occur> = self.weights
+                    .iter()
+                    .map(|&(ref occur, _)| *occur)
+                    .collect();
+                let occur_filter = OccurFilter::new(&occurs);
+                let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
+                Ok(box boolean_scorer)
+            }
+        }
   }
 }
--- a/src/query/occur.rs
+++ b/src/query/occur.rs
@@ -1,6 +1,6 @@
 /// Defines whether a term in a query must be present,
 /// should be present or must not be present.
-#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
 pub enum Occur {
    /// For a given document to be considered for scoring,
    /// at least one of the document with the Should or the Must
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -128,6 +128,24 @@ impl DocSet for PhraseScorer {
    fn size_hint(&self) -> u32 {
        self.intersection_docset.size_hint()
    }
+
+    /// Skips the underlying intersection to `target`, then settles on a doc
+    /// that actually holds the phrase, reporting how it landed.
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        // The term intersection moves first; its doc is only a candidate.
+        if self.intersection_docset.skip_next(target) == SkipResult::End {
+            return SkipResult::End;
+        }
+        if !self.phrase_match() {
+            // Candidate rejected: fall back to `advance()` to find a later match.
+            return if self.advance() { SkipResult::OverStep } else { SkipResult::End };
+        }
+        if self.doc() == target {
+            SkipResult::Reached
+        } else {
+            SkipResult::OverStep
+        }
+    }
 }

 impl Scorer for PhraseScorer {