From 1947a19700b9bce2dd0a9606e2bc7400f82f291f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 31 Jan 2018 23:56:54 +0900 Subject: [PATCH] Added bitset --- .gitignore | 3 +- .vimrc | 13 ++ src/common/bitset.rs | 210 ++++++++++++++++++++++ src/common/mod.rs | 2 + src/postings/docset.rs | 6 +- src/postings/intersection.rs | 3 +- src/postings/segment_postings.rs | 4 +- src/postings/vec_postings.rs | 4 +- src/query/all_query.rs | 4 +- src/query/bitset/mod.rs | 199 ++++++++++++++++++++ src/query/boolean_query/boolean_query.rs | 12 +- src/query/boolean_query/boolean_scorer.rs | 2 +- src/query/boolean_query/boolean_weight.rs | 42 +++-- src/query/mod.rs | 4 +- src/query/phrase_query/phrase_scorer.rs | 4 +- src/query/scorer.rs | 2 +- src/query/term_query/term_scorer.rs | 2 +- 17 files changed, 479 insertions(+), 37 deletions(-) create mode 100644 .vimrc create mode 100644 src/common/bitset.rs create mode 100644 src/query/bitset/mod.rs diff --git a/.gitignore b/.gitignore index e2a04b58a..b6f5cc5b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.swp target target/debug .vscode @@ -8,4 +9,4 @@ benchmark cpp/simdcomp/bitpackingbenchmark *.bk .idea -trace.dat \ No newline at end of file +trace.dat diff --git a/.vimrc b/.vimrc new file mode 100644 index 000000000..29eb2867c --- /dev/null +++ b/.vimrc @@ -0,0 +1,13 @@ +set wildignore+=*/examples/* + +set tabstop=2 +set shiftwidth=2 +set softtabstop=2 +set expandtab +set nosmarttab + +set textwidth=100 + +autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/ +autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw! 
+ diff --git a/src/common/bitset.rs b/src/common/bitset.rs new file mode 100644 index 000000000..e8345afd4 --- /dev/null +++ b/src/common/bitset.rs @@ -0,0 +1,210 @@ +use DocId; + +pub trait TinySet { + fn insert(&mut self, b: u32); + fn is_empty(&self) -> bool; + fn pop_lowest(&mut self) -> Option<u32>; + fn remove(&mut self, b: u32); + fn lowest(&mut self) -> Option<u32>; + + /// Update self to represent the + /// intersection of its elements and the other + /// set given in arguments. + fn intersect(&mut self, other: Self); + + /// Returns a `TinySet` that contains all values up + /// to limit excluded. + /// + /// The limit is assumed to be strictly lower than 64. + fn range_lower(limit: u32) -> u64; + + /// Returns a `TinySet` that contains all values greater + /// or equal to the given limit, included. (and up to 63) + /// + /// The limit is assumed to be strictly lower than 64. + fn range_greater_or_equal(from_included: u32) -> u64 { + assert!(from_included < 64); + !Self::range_lower(from_included) // complement of the strictly-lower range + } +} + +impl TinySet for u64 { + fn range_lower(from_included: u32) -> u64 { + assert!(from_included < 64); + (1u64 << (from_included as u64)) - 1u64 + } + + fn intersect(&mut self, filter_mask: u64) { + *self &= filter_mask; + } + + #[inline(always)] + fn insert(&mut self, b: u32) { + *self |= 1u64 << (b as u64); + } + + #[inline(always)] + fn is_empty(&self) -> bool { + *self == 0u64 + } + + #[inline(always)] + fn pop_lowest(&mut self) -> Option<u32> { + if let Some(lowest) = self.lowest() { + self.remove(lowest); + Some(lowest) + } else { + None + } + } + + #[inline(always)] + fn remove(&mut self, b: u32) { + *self &= !(1u64 << (b as u64)); // clears the bit; a no-op when `b` is absent + } + + #[inline(always)] + fn lowest(&mut self) -> Option<u32> { + if self.is_empty() { + None + } else { + let least_significant_bit = self.trailing_zeros() as u32; + Some(least_significant_bit) + } + } +} + +pub struct DocBitSet { + tinybitsets: Box<[u64]>, + size_hint: usize, //< Technically it should be u32, but we + // count multiple 
inserts. + // `usize` guards us from overflow. + max_doc: DocId +} + +impl DocBitSet { + pub fn with_maxdoc(max_doc: DocId) -> DocBitSet { + let num_buckets = (max_doc + 63) / 64; + DocBitSet { + tinybitsets: vec![0u64; num_buckets as usize].into_boxed_slice(), + size_hint: 0, + max_doc + } + } + + pub fn size_hint(&self) -> u32 { + if self.max_doc as usize > self.size_hint { + self.size_hint as u32 + } else { + self.max_doc + } + } + + pub fn insert(&mut self, doc: DocId) { + // we do not check saturated els. + self.size_hint += 1; + let bucket = (doc / 64u32) as usize; + self.tinybitsets[bucket].insert(doc % 64u32); + } + + pub fn contains(&self, doc: DocId) -> bool { + let tiny_bitset = self.tiny_bitset((doc / 64u32) as usize); + let lower = doc % 64; + let mask = 1u64 << (lower as u64); + (tiny_bitset & mask) != 0u64 + } + + pub fn max_doc(&self) -> DocId { + self.max_doc + } + + pub fn num_tiny_bitsets(&self) -> usize { + self.tinybitsets.len() + } + + pub fn tiny_bitset(&self, bucket: usize) -> u64 { + self.tinybitsets[bucket] + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use DocId; + use super::TinySet; + use super::DocBitSet; + + #[test] + fn test_tiny_set() { + assert!(0u64.is_empty()); + { + let mut u = 0u64; + u.insert(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()) + } + { + let mut u = 0u64; + u.insert(1u32); + u.insert(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()) + } + { + let mut u = 0u64; + u.insert(2u32); + assert_eq!(u.pop_lowest(), Some(2u32)); + u.insert(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()); + } + { + let mut u = 0u64; + u.insert(63u32); + assert_eq!(u.pop_lowest(), Some(63u32)); + assert!(u.pop_lowest().is_none()); + } + } + + + #[test] + fn test_docbitset() { + // docs are assumed to be lower than 100. 
+ let test_against_hashset = |docs: &[DocId], max_doc: u32| { + let mut hashset: HashSet<DocId> = HashSet::new(); + let mut docbitset = DocBitSet::with_maxdoc(max_doc); + for &doc in docs { + assert!(doc < max_doc); + hashset.insert(doc); + docbitset.insert(doc); + } + for doc in 0..max_doc { + assert_eq!( + hashset.contains(&doc), + docbitset.contains(doc) + ); + } + assert_eq!(docbitset.max_doc(), max_doc); + }; + + test_against_hashset(&[], 0); + test_against_hashset(&[], 1); + test_against_hashset(&[0u32], 1); + test_against_hashset(&[0u32], 100); + test_against_hashset(&[1u32, 2u32], 4); + test_against_hashset(&[99u32], 100); + test_against_hashset(&[63u32], 64); + test_against_hashset(&[62u32,63u32], 64); + } + + #[test] + fn test_docbitset_num_buckets() { + assert_eq!(DocBitSet::with_maxdoc(0u32).num_tiny_bitsets(), 0); + assert_eq!(DocBitSet::with_maxdoc(1u32).num_tiny_bitsets(), 1); + assert_eq!(DocBitSet::with_maxdoc(64u32).num_tiny_bitsets(), 1); + assert_eq!(DocBitSet::with_maxdoc(65u32).num_tiny_bitsets(), 2); + assert_eq!(DocBitSet::with_maxdoc(128u32).num_tiny_bitsets(), 2); + assert_eq!(DocBitSet::with_maxdoc(129u32).num_tiny_bitsets(), 3); + } +} + diff --git a/src/common/mod.rs b/src/common/mod.rs index 39c86aa3f..1975fc78c 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -4,6 +4,7 @@ mod vint; mod counting_writer; mod composite_file; pub mod bitpacker; +mod bitset; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub use self::serialize::BinarySerializable; @@ -12,6 +13,7 @@ pub use self::timer::TimerTree; pub use self::timer::OpenTimer; pub use self::vint::VInt; pub use self::counting_writer::CountingWriter; +pub use self::bitset::{TinySet, DocBitSet}; use std::io; diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 65c41f76b..daf50c949 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -92,7 +92,7 @@ pub trait DocSet { /// Returns a best-effort hint of the /// length of the docset. 
- fn size_hint(&self) -> usize; + fn size_hint(&self) -> u32; } impl DocSet for Box { @@ -111,7 +111,7 @@ impl DocSet for Box { unboxed.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { let unboxed: &TDocSet = self.borrow(); unboxed.size_hint() } @@ -133,7 +133,7 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { unref.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { let unref: &TDocSet = *self; unref.size_hint() } diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index b105405f6..5234f51c0 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -31,7 +31,8 @@ impl IntersectionDocSet { } impl DocSet for IntersectionDocSet { - fn size_hint(&self) -> usize { + /// Returns the minimum `.size_hint()` of the intersected docsets. + fn size_hint(&self) -> u32 { self.docsets .iter() .map(|docset| docset.size_hint()) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 682fc82f6..e93ba0bff 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -235,8 +235,8 @@ impl DocSet for SegmentPostings { } } - fn size_hint(&self) -> usize { - self.len() + fn size_hint(&self) -> u32 { + self.len() as u32 } /// Return the current document's `DocId`. 
diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index f6c5ae8d9..51c402cd6 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -35,8 +35,8 @@ impl DocSet for VecPostings { self.doc_ids[self.cursor.0] } - fn size_hint(&self) -> usize { - self.len() + fn size_hint(&self) -> u32 { + self.len() as u32 } } diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 632693cef..972d33918 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -62,8 +62,8 @@ impl DocSet for AllScorer { self.doc } - fn size_hint(&self) -> usize { - self.max_doc as usize + fn size_hint(&self) -> u32 { + self.max_doc } } diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs new file mode 100644 index 000000000..59ad8b1bd --- /dev/null +++ b/src/query/bitset/mod.rs @@ -0,0 +1,199 @@ +use common::{DocBitSet, TinySet}; +use DocId; +use postings::DocSet; +use postings::SkipResult; +use std::cmp::Ordering; + +/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`. +/// +/// # Implementation detail +/// +/// Skipping is relatively fast here as we can directly point to the +/// right tiny bitset bucket. 
+/// +/// TODO: Consider implementing a `BitTreeSet` in order to advance faster +/// when the bitset is sparse +pub struct BitSetDocSet { + docs: DocBitSet, + cursor_bucket: usize, //< index associated to the current tiny bitset + cursor_tinybitset: u64, + doc: u32 +} + +impl From<DocBitSet> for BitSetDocSet { + fn from(docs: DocBitSet) -> BitSetDocSet { + let first_tiny_bitset = + if docs.num_tiny_bitsets() == 0 { + 0u64 + } else { + docs.tiny_bitset(0) as u64 + }; + BitSetDocSet { + docs, + cursor_bucket: 0, + cursor_tinybitset: first_tiny_bitset, + doc: 0u32 + } + } +} + +impl DocSet for BitSetDocSet { + fn advance(&mut self) -> bool { + loop { + if let Some(lower) = self.cursor_tinybitset.pop_lowest() { + self.doc = (self.cursor_bucket as u32 * 64u32) | lower; + return true; + } else { + if self.cursor_bucket + 1 < self.docs.num_tiny_bitsets() { // no `- 1`: the bitset may have zero buckets + self.cursor_bucket += 1; + self.cursor_tinybitset = self.docs.tiny_bitset(self.cursor_bucket); + } else { + return false; + } + } + + } + } + + + fn skip_next(&mut self, target: DocId) -> SkipResult { + // skip is required to advance. + if !self.advance() { + return SkipResult::End; + } + let target_bucket = (target / 64u32) as usize; + + // Mask for all of the bits greater or equal + // to our target document. 
+ match target_bucket.cmp(&self.cursor_bucket) { + Ordering::Greater => { // the target lies in a later bucket: jump straight to it + self.cursor_bucket = target_bucket; + self.cursor_tinybitset = self.docs.tiny_bitset(target_bucket); + let greater: u64 = !<u64 as TinySet>::range_lower(target % 64); + self.cursor_tinybitset.intersect(greater); + if !self.advance() { + SkipResult::End + } else { + if self.doc() == target { + SkipResult::Reached + } else { + SkipResult::OverStep + } + } + } + Ordering::Equal => { + loop { + match self.doc().cmp(&target) { + Ordering::Less => { + if !self.advance() { + return SkipResult::End; + } + } + Ordering::Equal => { + return SkipResult::Reached; + } + Ordering::Greater => { + return SkipResult::OverStep; + } + } + } + } + Ordering::Less => SkipResult::OverStep // the current doc is already past the target's bucket + } + } + + /// Returns the current document + fn doc(&self) -> DocId { + self.doc + } + + /// Advances the cursor to the next document. + /// `None` is returned if the `DocSet` + /// has already been entirely consumed. + fn next(&mut self) -> Option<DocId> { + if self.advance() { + Some(self.doc()) + } else { + None + } + } + + /// Returns the `size_hint` of the underlying + /// `DocBitSet`: the number of inserts it + /// recorded (duplicates included), capped + /// at `max_doc`. 
+ fn size_hint(&self) -> u32 { + self.docs.size_hint() + } +} + +#[cfg(test)] +mod tests { + use DocId; + use common::DocBitSet; + use postings::{SkipResult, DocSet}; + use super::BitSetDocSet; + + fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet { + let mut docset = DocBitSet::with_maxdoc(max_doc); + for &doc in docs { + docset.insert(doc); + } + BitSetDocSet::from(docset) + } + + fn test_go_through_sequential(docs: &[DocId]) { + let mut docset = create_docbitset(docs, 1_000u32); + for &doc in docs { + assert!(docset.advance()); + assert_eq!(doc, docset.doc()); + } + assert!(!docset.advance()); + assert!(!docset.advance()); + } + + #[test] + fn test_docbitset_sequential() { + test_go_through_sequential(&[]); + test_go_through_sequential(&[1,2,3]); + test_go_through_sequential(&[1,2,3,4,5,63,64,65]); + test_go_through_sequential(&[63,64,65]); + test_go_through_sequential(&[1,2,3,4,95,96,97,98,99]); + } + + #[test] + fn test_docbitset_skip() { + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); + assert_eq!(docset.skip_next(7), SkipResult::Reached); + assert_eq!(docset.doc(), 7); + assert!(docset.advance()); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); + assert_eq!(docset.skip_next(3), SkipResult::OverStep); + assert_eq!(docset.doc(), 5); + assert!(docset.advance()); + } + { + let mut docset = create_docbitset(&[5112], 10_000); + assert_eq!(docset.skip_next(5112), SkipResult::Reached); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[5112], 10_000); + assert_eq!(docset.skip_next(5113), SkipResult::End); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5112], 10_000); // skip across buckets + assert_eq!(docset.skip_next(5111), SkipResult::OverStep); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + } + +} \ No newline at end of file diff --git 
a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index af4418d4e..660233446 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -8,7 +8,6 @@ use schema::Term; use query::TermQuery; use schema::IndexRecordOption; use query::Occur; -use query::OccurFilter; /// The boolean query combines a set of queries /// @@ -39,14 +38,11 @@ impl Query for BooleanQuery { fn weight(&self, searcher: &Searcher) -> Result> { let sub_weights = self.subqueries .iter() - .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) + .map(|&(ref occur, ref subquery)| { + Ok((*occur, subquery.weight(searcher)?)) + }) .collect::>()?; - let occurs: Vec = self.subqueries - .iter() - .map(|&(ref occur, ref _subquery)| *occur) - .collect(); - let filter = OccurFilter::new(&occurs); - Ok(box BooleanWeight::new(sub_weights, filter)) + Ok(box BooleanWeight::new(sub_weights)) } } diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 5bc574c68..12228850b 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -90,7 +90,7 @@ impl BooleanScorer { } impl DocSet for BooleanScorer { - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { // TODO fix this. it should be the min // of the MUST scorer // and the max of the SHOULD scorers. 
diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index ef5d36374..96ae4dae2 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,31 +1,49 @@ use query::Weight; use core::SegmentReader; +use query::EmptyScorer; use query::Scorer; use super::BooleanScorer; use query::OccurFilter; +use query::Occur; use Result; pub struct BooleanWeight { - weights: Vec>, - occur_filter: OccurFilter, + weights: Vec<(Occur, Box)>, } impl BooleanWeight { - pub fn new(weights: Vec>, occur_filter: OccurFilter) -> BooleanWeight { + pub fn new(weights: Vec<(Occur, Box)>) -> BooleanWeight { BooleanWeight { - weights, - occur_filter, + weights } } } impl Weight for BooleanWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - let sub_scorers: Vec> = self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect::>()?; - let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); - Ok(box boolean_scorer) - } + if self.weights.is_empty() { + Ok(box EmptyScorer) + } else if self.weights.len() == 1 { + let &(occur, ref weight) = &self.weights[0]; + if occur == Occur::MustNot { + Ok(box EmptyScorer) + } else { + weight.scorer(reader) + } + } else { + let sub_scorers: Vec> = self.weights + .iter() + .map(|&(_, ref weight)| weight) + .map(|weight| weight.scorer(reader)) + .collect::>()?; + let occurs: Vec = self.weights + .iter() + .map(|&(ref occur, _)| *occur) + .collect(); + let occur_filter = OccurFilter::new(&occurs); + let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter); + Ok(box boolean_scorer) + } + + } } diff --git a/src/query/mod.rs b/src/query/mod.rs index 7177871b5..aafeb0a7c 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -12,7 +12,9 @@ mod term_query; mod query_parser; mod phrase_query; mod all_query; +mod bitset; +pub use self::bitset::BitSetDocSet; pub use self::boolean_query::BooleanQuery; pub use 
self::occur_filter::OccurFilter; pub use self::occur::Occur; @@ -24,4 +26,4 @@ pub use self::scorer::EmptyScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; -pub use self::all_query::{AllQuery, AllWeight, AllScorer}; \ No newline at end of file +pub use self::all_query::{AllQuery, AllWeight, AllScorer}; diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 4b7a26095..ece4994d9 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -35,7 +35,7 @@ impl DocSet for PostingsWithOffset { self.segment_postings.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.segment_postings.size_hint() } @@ -125,7 +125,7 @@ impl DocSet for PhraseScorer { self.intersection_docset.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.intersection_docset.size_hint() } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 170e6aa56..04bd13619 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -49,7 +49,7 @@ impl DocSet for EmptyScorer { DocId::max_value() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { 0 } } diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 8d99c6c03..d9095f4d2 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -36,7 +36,7 @@ where self.postings.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.postings.size_hint() } }