Added bitset

2026-06-03 00:50:41 +00:00 · 2018-01-31 23:56:54 +09:00
parent 340693184f
commit 1947a19700
17 changed files with 479 additions and 37 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.swp
 target
 target/debug
 .vscode
@@ -8,4 +9,4 @@ benchmark
 cpp/simdcomp/bitpackingbenchmark
 *.bk
 .idea
-trace.dat
+trace.dat
--- a/.vimrc
+++ b/.vimrc
@@ -0,0 +1,13 @@
+set wildignore+=*/examples/*
+" Two-column indentation, with tabs expanded to spaces.
+set tabstop=2
+set shiftwidth=2
+set softtabstop=2
+set expandtab
+set nosmarttab
+" Hard-wrap text at 100 columns.
+set textwidth=100
+" Use the rusty-tags index for Rust buffers; run rusty-tags in the background after each write.
+autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/
+autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw!
+
--- a/src/common/bitset.rs
+++ b/src/common/bitset.rs
@@ -0,0 +1,210 @@
+use DocId;
+
+/// A small set of `u32` values; the `u64` implementation stores one bit per value in `0..64`.
+pub trait TinySet {
+    fn insert(&mut self, b: u32);
+    fn is_empty(&self) -> bool;
+    fn pop_lowest(&mut self) -> Option<u32>;
+    fn remove(&mut self, b: u32);
+    fn lowest(&mut self) -> Option<u32>;
+
+    /// Updates `self` to the intersection of its elements and those of `other`.
+    fn intersect(&mut self, other: Self);
+
+    /// Returns a `TinySet` that contains all values up
+    /// to `limit`, excluded.
+    ///
+    /// The limit is assumed to be strictly lower than 64.
+    fn range_lower(limit: u32) -> u64;
+
+    /// Returns a `TinySet` that contains all values greater
+    /// or equal to `from_included` (and up to 63).
+    ///
+    /// The limit is assumed to be strictly lower than 64.
+    fn range_greater_or_equal(from_included: u32) -> u64 {
+        assert!(from_included < 64);
+        // Complement of the "strictly lower" mask; `0 ^ mask` would return the mask unchanged.
+        !Self::range_lower(from_included)
+    }
+}
+
+impl TinySet for u64 {
+    /// Mask with bits `0..limit` set; panics unless `limit < 64`.
+    fn range_lower(limit: u32) -> u64 {
+        assert!(limit < 64);
+        (1u64 << limit) - 1u64
+    }
+
+    fn intersect(&mut self, filter_mask: u64) {
+        *self &= filter_mask;
+    }
+
+    #[inline(always)]
+    fn insert(&mut self, b: u32) {
+        *self |= 1u64 << b;
+    }
+
+    #[inline(always)]
+    fn is_empty(&self) -> bool {
+        *self == 0u64
+    }
+
+    /// Removes and returns the smallest element, or `None` if the set is empty.
+    #[inline(always)]
+    fn pop_lowest(&mut self) -> Option<u32> {
+        let lowest = self.lowest()?;
+        self.remove(lowest);
+        Some(lowest)
+    }
+
+    /// Clears bit `b`; unlike XOR, `&= !mask` leaves the set untouched when `b` is absent.
+    #[inline(always)]
+    fn remove(&mut self, b: u32) {
+        *self &= !(1u64 << b);
+    }
+
+    /// Returns the smallest element (index of the lowest set bit), or `None` if empty.
+    #[inline(always)]
+    fn lowest(&mut self) -> Option<u32> {
+        if self.is_empty() {
+            None
+        } else {
+            Some(self.trailing_zeros())
+        }
+    }
+}
+
+/// A bitset over doc ids: one bit per doc, packed into 64-bit buckets sized from `max_doc`.
+pub struct DocBitSet {
+    tinybitsets: Box<[u64]>,
+    // Count of `insert` calls, duplicates included; `usize` guards us from overflow.
+    size_hint: usize,
+    max_doc: DocId
+}
+
+impl DocBitSet {
+    /// Creates an empty bitset with enough 64-bit buckets for doc ids in `0..max_doc`.
+    pub fn with_maxdoc(max_doc: DocId) -> DocBitSet {
+        // Round up in `u64`: `max_doc + 63` would overflow `u32` near `u32::max_value()`.
+        let num_buckets = ((u64::from(max_doc) + 63) / 64) as usize;
+        DocBitSet {
+            tinybitsets: vec![0u64; num_buckets].into_boxed_slice(),
+            size_hint: 0,
+            max_doc
+        }
+    }
+
+    /// Number of `insert` calls so far (duplicates included), capped at `max_doc`.
+    pub fn size_hint(&self) -> u32 {
+        (self.max_doc as usize).min(self.size_hint) as u32
+    }
+
+    /// Sets the bit of `doc` and bumps the insert counter (no saturation check).
+    pub fn insert(&mut self, doc: DocId) {
+        self.size_hint += 1;
+        let bucket = (doc / 64u32) as usize;
+        self.tinybitsets[bucket].insert(doc % 64u32);
+    }
+
+    /// Returns true iff the bit of `doc` is set.
+    pub fn contains(&self, doc: DocId) -> bool {
+        let tiny_bitset = self.tiny_bitset((doc / 64u32) as usize);
+        let mask = 1u64 << (doc % 64u32);
+        (tiny_bitset & mask) != 0u64
+    }
+
+    pub fn max_doc(&self) -> DocId {
+        self.max_doc
+    }
+
+    pub fn num_tiny_bitsets(&self) -> usize {
+        self.tinybitsets.len()
+    }
+
+    /// Returns the raw 64-bit bucket at index `bucket`.
+    pub fn tiny_bitset(&self, bucket: usize) -> u64 {
+        self.tinybitsets[bucket]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+    use DocId;
+    use super::TinySet;
+    use super::DocBitSet;
+
+    #[test]
+    fn test_tiny_set() {
+        assert!(0u64.is_empty());
+
+        // A single element is popped once, then the set is empty.
+        let mut single = 0u64;
+        single.insert(1u32);
+        assert_eq!(single.pop_lowest(), Some(1u32));
+        assert!(single.pop_lowest().is_none());
+
+        // Inserting the same element twice stores it only once.
+        let mut duplicated = 0u64;
+        duplicated.insert(1u32);
+        duplicated.insert(1u32);
+        assert_eq!(duplicated.pop_lowest(), Some(1u32));
+        assert!(duplicated.pop_lowest().is_none());
+
+        // An element inserted after a pop is returned as well.
+        let mut interleaved = 0u64;
+        interleaved.insert(2u32);
+        assert_eq!(interleaved.pop_lowest(), Some(2u32));
+        interleaved.insert(1u32);
+        assert_eq!(interleaved.pop_lowest(), Some(1u32));
+        assert!(interleaved.pop_lowest().is_none());
+
+        // 63 is the largest value a `u64` tiny set can hold.
+        let mut highest = 0u64;
+        highest.insert(63u32);
+        assert_eq!(highest.pop_lowest(), Some(63u32));
+        assert!(highest.pop_lowest().is_none());
+    }
+
+    #[test]
+    fn test_docbitset() {
+        /// Feeds `docs` to both a `HashSet` and a `DocBitSet` built for `max_doc`,
+        /// then checks that both agree on every doc id in `0..max_doc`.
+        fn check_against_hashset(docs: &[DocId], max_doc: u32) {
+            let mut docbitset = DocBitSet::with_maxdoc(max_doc);
+            let mut expected: HashSet<DocId> = HashSet::new();
+            for &doc in docs {
+                assert!(doc < max_doc);
+                expected.insert(doc);
+                docbitset.insert(doc);
+            }
+            for doc in 0..max_doc {
+                assert_eq!(
+                    expected.contains(&doc),
+                    docbitset.contains(doc)
+                );
+            }
+            assert_eq!(docbitset.max_doc(), max_doc);
+        }
+
+        check_against_hashset(&[], 0);
+        check_against_hashset(&[], 1);
+        check_against_hashset(&[0u32], 1);
+        check_against_hashset(&[0u32], 100);
+        check_against_hashset(&[1u32, 2u32], 4);
+        check_against_hashset(&[99u32], 100);
+        check_against_hashset(&[63u32], 64);
+        check_against_hashset(&[62u32, 63u32], 64);
+    }
+
+    #[test]
+    fn test_docbitset_num_buckets() {
+        // (max_doc, expected number of 64-bit buckets)
+        let cases: [(DocId, usize); 6] =
+            [(0, 0), (1, 1), (64, 1), (65, 2), (128, 2), (129, 3)];
+        for &(max_doc, expected) in cases.iter() {
+            assert_eq!(DocBitSet::with_maxdoc(max_doc).num_tiny_bitsets(), expected);
+        }
+    }
+}
+
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -4,6 +4,7 @@ mod vint;
 mod counting_writer;
 mod composite_file;
 pub mod bitpacker;
+mod bitset;

 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::serialize::BinarySerializable;
@@ -12,6 +13,7 @@ pub use self::timer::TimerTree;
 pub use self::timer::OpenTimer;
 pub use self::vint::VInt;
 pub use self::counting_writer::CountingWriter;
+pub use self::bitset::{TinySet, DocBitSet};

 use std::io;

--- a/src/postings/docset.rs
+++ b/src/postings/docset.rs
@@ -92,7 +92,7 @@ pub trait DocSet {

    /// Returns a best-effort hint of the
    /// length of the docset.
-    fn size_hint(&self) -> usize;
+    fn size_hint(&self) -> u32;
 }

 impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
@@ -111,7 +111,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
        unboxed.doc()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        let unboxed: &TDocSet = self.borrow();
        unboxed.size_hint()
    }
@@ -133,7 +133,7 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
        unref.doc()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        let unref: &TDocSet = *self;
        unref.size_hint()
    }
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -31,7 +31,8 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
 }

 impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
-    fn size_hint(&self) -> usize {
+    /// Returns the minimum `.size_hint()` of the intersected docsets.
+    fn size_hint(&self) -> u32 {
        self.docsets
            .iter()
            .map(|docset| docset.size_hint())
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -235,8 +235,8 @@ impl DocSet for SegmentPostings {
        }
    }

-    fn size_hint(&self) -> usize {
-        self.len()
+    fn size_hint(&self) -> u32 {
+        self.len() as u32
    }

    /// Return the current document's `DocId`.
--- a/src/postings/vec_postings.rs
+++ b/src/postings/vec_postings.rs
@@ -35,8 +35,8 @@ impl DocSet for VecPostings {
        self.doc_ids[self.cursor.0]
    }

-    fn size_hint(&self) -> usize {
-        self.len()
+    fn size_hint(&self) -> u32 {
+        self.len() as u32
    }
 }

--- a/src/query/all_query.rs
+++ b/src/query/all_query.rs
@@ -62,8 +62,8 @@ impl DocSet for AllScorer {
        self.doc
    }

-    fn size_hint(&self) -> usize {
-        self.max_doc as usize
+    fn size_hint(&self) -> u32 {
+        self.max_doc
    }
 }

--- a/src/query/bitset/mod.rs
+++ b/src/query/bitset/mod.rs
@@ -0,0 +1,199 @@
+use common::{DocBitSet, TinySet};
+use DocId;
+use postings::DocSet;
+use postings::SkipResult;
+use std::cmp::Ordering;
+
+/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it were a `DocSet`.
+///
+/// # Implementation detail
+///
+/// Skipping is relatively fast here as we can directly point to the
+/// right tiny bitset bucket.
+///
+/// TODO: Consider implementing a `BitTreeSet` in order to advance faster
+/// when the bitset is sparse.
+pub struct BitSetDocSet {
+    docs: DocBitSet,
+    cursor_bucket: usize, //< index associated to the current tiny bitset
+    cursor_tinybitset: u64, //< bits of the current bucket not yet consumed by the cursor
+    doc: u32 //< document last reached by `advance` (0 before the first call)
+}
+
+impl From<DocBitSet> for BitSetDocSet {
+    /// Wraps `docs`, with the cursor placed before the first document.
+    fn from(docs: DocBitSet) -> BitSetDocSet {
+        // A bitset without any bucket (`max_doc == 0`) starts from an empty tiny bitset.
+        let cursor_tinybitset = match docs.num_tiny_bitsets() {
+            0 => 0u64,
+            _ => docs.tiny_bitset(0),
+        };
+        BitSetDocSet {
+            docs,
+            cursor_bucket: 0,
+            cursor_tinybitset,
+            doc: 0u32
+        }
+    }
+}
+
+impl DocSet for BitSetDocSet {
+    /// Moves to the next document; returns `false` once every bucket is exhausted.
+    fn advance(&mut self) -> bool {
+        loop {
+            if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
+                self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
+                return true;
+            }
+            // Current bucket exhausted: load the next one, if any. Written as
+            // `cursor + 1 < len` because `len - 1` underflows for a bitset without buckets.
+            if self.cursor_bucket + 1 < self.docs.num_tiny_bitsets() {
+                self.cursor_bucket += 1;
+                self.cursor_tinybitset = self.docs.tiny_bitset(self.cursor_bucket);
+            } else {
+                return false;
+            }
+        }
+    }
+
+    /// Advances at least once, then up to the first document `>= target`.
+    /// Returns `Reached` on `target` itself, `OverStep` on a greater document,
+    /// and `End` if the docset is exhausted first.
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        // skip is required to advance.
+        if !self.advance() {
+            return SkipResult::End;
+        }
+        let target_bucket = (target / 64u32) as usize;
+        match target_bucket.cmp(&self.cursor_bucket) {
+            // The cursor already sits in a later bucket than the target.
+            Ordering::Less => SkipResult::OverStep,
+            Ordering::Equal => {
+                loop {
+                    match self.doc().cmp(&target) {
+                        Ordering::Less => {
+                            if !self.advance() {
+                                return SkipResult::End;
+                            }
+                        }
+                        Ordering::Equal => return SkipResult::Reached,
+                        Ordering::Greater => return SkipResult::OverStep,
+                    }
+                }
+            }
+            Ordering::Greater => {
+                let num_buckets = self.docs.num_tiny_bitsets();
+                if target_bucket >= num_buckets {
+                    // No document can be that large: exhaust the cursor.
+                    // `num_buckets >= 1` here since `advance` just succeeded.
+                    self.cursor_bucket = num_buckets - 1;
+                    self.cursor_tinybitset = 0u64;
+                    return SkipResult::End;
+                }
+                // Jump straight to the target's bucket and drop the bits below the target.
+                self.cursor_bucket = target_bucket;
+                self.cursor_tinybitset = self.docs.tiny_bitset(target_bucket);
+                let greater: u64 = !<u64 as TinySet>::range_lower(target % 64u32);
+                self.cursor_tinybitset.intersect(greater);
+                if !self.advance() {
+                    SkipResult::End
+                } else if self.doc() == target {
+                    SkipResult::Reached
+                } else {
+                    SkipResult::OverStep
+                }
+            }
+        }
+    }
+
+    /// Returns the current document
+    fn doc(&self) -> DocId {
+        self.doc
+    }
+
+    /// Advances the cursor to the next document. `None` is returned
+    /// if the `DocSet` has already been entirely consumed.
+    fn next(&mut self) -> Option<DocId> {
+        if self.advance() {
+            Some(self.doc())
+        } else {
+            None
+        }
+    }
+
+    /// Returns the size hint of the underlying `DocBitSet`.
+    fn size_hint(&self) -> u32 {
+        self.docs.size_hint()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use DocId;
+    use common::DocBitSet;
+    use postings::{SkipResult, DocSet};
+    use super::BitSetDocSet;
+
+    /// Builds a `BitSetDocSet` holding `docs`, with room for `max_doc` documents.
+    fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
+        let mut docset = DocBitSet::with_maxdoc(max_doc);
+        for &doc in docs {
+            docset.insert(doc);
+        }
+        BitSetDocSet::from(docset)
+    }
+
+    /// Checks that `advance` yields exactly `docs` (given in increasing order), then stops.
+    fn test_go_through_sequential(docs: &[DocId]) {
+        let mut docset = create_docbitset(docs, 1_000u32);
+        for &doc in docs {
+            assert!(docset.advance());
+            assert_eq!(doc, docset.doc());
+        }
+        assert!(!docset.advance());
+        assert!(!docset.advance());
+    }
+
+    #[test]
+    fn test_docbitset_sequential() {
+        test_go_through_sequential(&[]);
+        test_go_through_sequential(&[1,2,3]);
+        test_go_through_sequential(&[1,2,3,4,5,63,64,65]);
+        test_go_through_sequential(&[63,64,65]);
+        test_go_through_sequential(&[1,2,3,4,95,96,97,98,99]);
+    }
+
+    #[test]
+    fn test_docbitset_skip() {
+        // The target is present: the cursor stops exactly on it.
+        let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
+        assert_eq!(docset.skip_next(7), SkipResult::Reached);
+        assert_eq!(docset.doc(), 7);
+        assert!(docset.advance());
+        assert_eq!(docset.doc(), 5112);
+        assert!(!docset.advance());
+
+        // The target is absent: the cursor stops on the next greater document.
+        let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
+        assert_eq!(docset.skip_next(3), SkipResult::OverStep);
+        assert_eq!(docset.doc(), 5);
+        assert!(docset.advance());
+
+        // A lone document can be reached directly.
+        let mut docset = create_docbitset(&[5112], 10_000);
+        assert_eq!(docset.skip_next(5112), SkipResult::Reached);
+        assert_eq!(docset.doc(), 5112);
+        assert!(!docset.advance());
+
+        // A target greater than every document exhausts the docset.
+        let mut docset = create_docbitset(&[5112], 10_000);
+        assert_eq!(docset.skip_next(5113), SkipResult::End);
+        assert!(!docset.advance());
+
+        // A target just below a lone document oversteps onto it.
+        let mut docset = create_docbitset(&[5112], 10_000);
+        assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
+        assert_eq!(docset.doc(), 5112);
+        assert!(!docset.advance());
+    }
+}
--- a/src/query/boolean_query/boolean_query.rs
+++ b/src/query/boolean_query/boolean_query.rs
@@ -8,7 +8,6 @@ use schema::Term;
 use query::TermQuery;
 use schema::IndexRecordOption;
 use query::Occur;
-use query::OccurFilter;

 /// The boolean query combines a set of queries
 ///
@@ -39,14 +38,11 @@ impl Query for BooleanQuery {
    fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
        let sub_weights = self.subqueries
            .iter()
-            .map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
+            .map(|&(ref occur, ref subquery)| {
+                Ok((*occur, subquery.weight(searcher)?))
+            })
            .collect::<Result<_>>()?;
-        let occurs: Vec<Occur> = self.subqueries
-            .iter()
-            .map(|&(ref occur, ref _subquery)| *occur)
-            .collect();
-        let filter = OccurFilter::new(&occurs);
-        Ok(box BooleanWeight::new(sub_weights, filter))
+        Ok(box BooleanWeight::new(sub_weights))
    }
 }

--- a/src/query/boolean_query/boolean_scorer.rs
+++ b/src/query/boolean_query/boolean_scorer.rs
@@ -90,7 +90,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
 }

 impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        // TODO fix this. it should be the min
        // of the MUST scorer
        // and the max of the SHOULD scorers.
--- a/src/query/boolean_query/boolean_weight.rs
+++ b/src/query/boolean_query/boolean_weight.rs
@@ -1,31 +1,49 @@
 use query::Weight;
 use core::SegmentReader;
+use query::EmptyScorer;
 use query::Scorer;
 use super::BooleanScorer;
 use query::OccurFilter;
+use query::Occur;
 use Result;

 pub struct BooleanWeight {
-    weights: Vec<Box<Weight>>,
-    occur_filter: OccurFilter,
+    weights: Vec<(Occur, Box<Weight>)>,
 }

 impl BooleanWeight {
-    pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
+    pub fn new(weights: Vec<(Occur, Box<Weight>)>) -> BooleanWeight {
        BooleanWeight {
-            weights,
-            occur_filter,
+            weights
        }
    }
 }

 impl Weight for BooleanWeight {
    fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
-        let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
-            .iter()
-            .map(|weight| weight.scorer(reader))
-            .collect::<Result<_>>()?;
-        let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
-        Ok(box boolean_scorer)
-    }
+        if self.weights.is_empty() {
+            Ok(box EmptyScorer)
+        } else if self.weights.len() == 1 {
+            let &(occur, ref weight) =  &self.weights[0];
+            if occur == Occur::MustNot {
+                Ok(box EmptyScorer)
+            } else {
+                weight.scorer(reader)
+            }
+        } else {
+            let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
+                .iter()
+                .map(|&(_, ref weight)| weight)
+                .map(|weight| weight.scorer(reader))
+                .collect::<Result<_>>()?;
+            let occurs: Vec<Occur> = self.weights
+                .iter()
+                .map(|&(ref occur, _)| *occur)
+                .collect();
+            let occur_filter = OccurFilter::new(&occurs);
+            let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
+            Ok(box boolean_scorer)
+        }
+
+   }
 }
--- a/src/query/mod.rs
+++ b/src/query/mod.rs
@@ -12,7 +12,9 @@ mod term_query;
 mod query_parser;
 mod phrase_query;
 mod all_query;
+mod bitset;

+pub use self::bitset::BitSetDocSet;
 pub use self::boolean_query::BooleanQuery;
 pub use self::occur_filter::OccurFilter;
 pub use self::occur::Occur;
@@ -24,4 +26,4 @@ pub use self::scorer::EmptyScorer;
 pub use self::scorer::Scorer;
 pub use self::term_query::TermQuery;
 pub use self::weight::Weight;
-pub use self::all_query::{AllQuery, AllWeight, AllScorer};
+pub use self::all_query::{AllQuery, AllWeight, AllScorer};
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -35,7 +35,7 @@ impl DocSet for PostingsWithOffset {
        self.segment_postings.doc()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        self.segment_postings.size_hint()
    }

@@ -125,7 +125,7 @@ impl DocSet for PhraseScorer {
        self.intersection_docset.doc()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        self.intersection_docset.size_hint()
    }
 }
--- a/src/query/scorer.rs
+++ b/src/query/scorer.rs
@@ -49,7 +49,7 @@ impl DocSet for EmptyScorer {
        DocId::max_value()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        0
    }
 }
--- a/src/query/term_query/term_scorer.rs
+++ b/src/query/term_query/term_scorer.rs
@@ -36,7 +36,7 @@ where
        self.postings.doc()
    }

-    fn size_hint(&self) -> usize {
+    fn size_hint(&self) -> u32 {
        self.postings.size_hint()
    }
 }