Added bitset

This commit is contained in:
Paul Masurel
2018-01-31 23:56:54 +09:00
parent 340693184f
commit 1947a19700
17 changed files with 479 additions and 37 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,4 @@
*.swp
target
target/debug
.vscode
@@ -8,4 +9,4 @@ benchmark
cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat
trace.dat

13
.vimrc Normal file
View File

@@ -0,0 +1,13 @@
set wildignore+=*/examples/*
set tabstop=2
set shiftwidth=2
set softtabstop=2
set expandtab
set nosmarttab
set textwidth=100
autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/
autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw!

210
src/common/bitset.rs Normal file
View File

@@ -0,0 +1,210 @@
use DocId;
/// A small set of integers taken from `0..64`.
///
/// The `u64` implementation of this trait stores element `b` as bit `b`
/// of the word.
pub trait TinySet {
    /// Adds `b` to the set.
    fn insert(&mut self, b: u32);

    /// Returns `true` when the set holds no element.
    fn is_empty(&self) -> bool;

    /// Removes the smallest element of the set and returns it.
    ///
    /// Returns `None` if the set is empty.
    fn pop_lowest(&mut self) -> Option<u32>;

    /// Removes `b` from the set.
    fn remove(&mut self, b: u32);

    /// Returns the smallest element of the set without removing it,
    /// or `None` if the set is empty.
    fn lowest(&mut self) -> Option<u32>;

    /// Update self to represent the
    /// intersection of its elements and the other
    /// set given in arguments.
    fn intersect(&mut self, other: Self);

    /// Returns a `TinySet` that contains all values up
    /// to limit excluded.
    ///
    /// The limit is assumed to be strictly lower than 64.
    fn range_lower(limit: u32) -> u64;

    /// Returns a `TinySet` that contains all values greater
    /// or equal to the given limit, included. (and up to 63)
    ///
    /// The limit is assumed to be strictly lower than 64.
    ///
    /// # Panics
    ///
    /// Panics if `from_included >= 64`.
    fn range_greater_or_equal(from_included: u32) -> u64 {
        assert!(from_included < 64);
        // The wanted set is the complement of `[0, from_included)`.
        // (`0 ^ x` would simply give back `x`, i.e. the *lower* range.)
        !Self::range_lower(from_included)
    }
}
/// `u64`-backed `TinySet`: element `b` belongs to the set iff bit `b` is set.
impl TinySet for u64 {
    /// Returns the word whose bits `0..limit` are set.
    ///
    /// # Panics
    ///
    /// Panics if `limit >= 64`.
    fn range_lower(limit: u32) -> u64 {
        assert!(limit < 64);
        (1u64 << limit) - 1u64
    }

    /// Keeps only the elements that are also present in `filter_mask`.
    fn intersect(&mut self, filter_mask: u64) {
        *self &= filter_mask;
    }

    /// Sets bit `b`. Inserting an element twice has no further effect.
    #[inline(always)]
    fn insert(&mut self, b: u32) {
        *self |= 1u64 << b;
    }

    /// Returns `true` when no bit is set.
    #[inline(always)]
    fn is_empty(&self) -> bool {
        *self == 0u64
    }

    /// Removes and returns the lowest set bit, or `None` when the word is `0`.
    #[inline(always)]
    fn pop_lowest(&mut self) -> Option<u32> {
        let lowest = self.lowest();
        if let Some(bit) = lowest {
            self.remove(bit);
        }
        lowest
    }

    /// Clears bit `b`. Removing an element that is absent is a no-op.
    #[inline(always)]
    fn remove(&mut self, b: u32) {
        // Clear the bit with an AND-NOT mask. A XOR would *toggle* it, and
        // therefore insert `b` whenever it was not in the set to begin with.
        *self &= !(1u64 << b);
    }

    /// Returns the index of the lowest set bit, or `None` when the word is `0`.
    #[inline(always)]
    fn lowest(&mut self) -> Option<u32> {
        if self.is_empty() {
            None
        } else {
            Some(self.trailing_zeros())
        }
    }
}
/// A set of `DocId`s stored as a boxed slice of 64-bit words.
///
/// Document `doc` is recorded in word `doc / 64`, at bit `doc % 64`.
pub struct DocBitSet {
// Backing words; their number is derived from `max_doc` at construction time.
tinybitsets: Box<[u64]>,
// Number of calls to `insert` so far. Inserting the same document twice is
// counted twice, so this can grow past `max_doc`: technically a `u32` would
// be the natural type, but `usize` guards us from overflow.
size_hint: usize,
// The `max_doc` value this set was created with.
max_doc: DocId
}
impl DocBitSet {
    /// Creates an empty set with enough 64-bit words to hold every
    /// document in `0..max_doc`.
    pub fn with_maxdoc(max_doc: DocId) -> DocBitSet {
        // Ceiling division, written so that it cannot overflow `u32`
        // (`max_doc + 63` does as soon as `max_doc > u32::MAX - 63`).
        let num_buckets = max_doc / 64u32 + if max_doc % 64u32 == 0 { 0 } else { 1 };
        DocBitSet {
            tinybitsets: vec![0u64; num_buckets as usize].into_boxed_slice(),
            size_hint: 0,
            max_doc,
        }
    }

    /// Returns an estimate of the number of documents in the set.
    ///
    /// This is the number of calls to `insert`, capped at `max_doc`.
    /// Duplicate inserts are counted, so the value may overestimate.
    pub fn size_hint(&self) -> u32 {
        if self.max_doc as usize > self.size_hint {
            // Strictly below `max_doc`, hence it fits in a `u32`.
            self.size_hint as u32
        } else {
            self.max_doc
        }
    }

    /// Adds `doc` to the set.
    ///
    /// # Panics
    ///
    /// Panics if `doc / 64` is not a valid word index, i.e. if `doc` lies
    /// beyond the capacity allocated by `with_maxdoc`.
    pub fn insert(&mut self, doc: DocId) {
        // we do not check saturated els.
        self.size_hint += 1;
        let bucket = (doc / 64u32) as usize;
        self.tinybitsets[bucket].insert(doc % 64u32);
    }

    /// Returns `true` if `doc` was inserted in the set.
    ///
    /// A `doc` that falls beyond the allocated words cannot have been
    /// inserted, so `false` is returned for it.
    pub fn contains(&self, doc: DocId) -> bool {
        let bucket = (doc / 64u32) as usize;
        match self.tinybitsets.get(bucket) {
            Some(&tiny_bitset) => {
                let mask = 1u64 << (doc % 64u32);
                (tiny_bitset & mask) != 0u64
            }
            None => false,
        }
    }

    /// Returns the `max_doc` value the set was created with.
    pub fn max_doc(&self) -> DocId {
        self.max_doc
    }

    /// Returns the number of 64-bit words backing the set.
    pub fn num_tiny_bitsets(&self) -> usize {
        self.tinybitsets.len()
    }

    /// Returns the word at index `bucket`.
    ///
    /// # Panics
    ///
    /// Panics if `bucket >= self.num_tiny_bitsets()`.
    pub fn tiny_bitset(&self, bucket: usize) -> u64 {
        self.tinybitsets[bucket]
    }
}
#[cfg(test)]
mod tests {

    use std::collections::HashSet;
    use DocId;
    use super::TinySet;
    use super::DocBitSet;

    /// Fills a `DocBitSet` and a `HashSet` with the same documents and checks
    /// that both agree on membership for every doc id in `0..max_doc`.
    ///
    /// Every element of `docs` must be strictly lower than `max_doc`.
    fn check_against_hashset(docs: &[DocId], max_doc: u32) {
        let mut bitset = DocBitSet::with_maxdoc(max_doc);
        let mut expected: HashSet<DocId> = HashSet::new();
        for &doc in docs {
            assert!(doc < max_doc);
            expected.insert(doc);
            bitset.insert(doc);
        }
        for candidate in 0..max_doc {
            assert_eq!(expected.contains(&candidate), bitset.contains(candidate));
        }
        assert_eq!(bitset.max_doc(), max_doc);
    }

    #[test]
    fn test_tiny_set() {
        assert!(0u64.is_empty());

        // A single element comes out once.
        let mut single = 0u64;
        single.insert(1u32);
        assert_eq!(single.pop_lowest(), Some(1u32));
        assert!(single.pop_lowest().is_none());

        // Inserting the same element twice still yields it once.
        let mut duplicated = 0u64;
        duplicated.insert(1u32);
        duplicated.insert(1u32);
        assert_eq!(duplicated.pop_lowest(), Some(1u32));
        assert!(duplicated.pop_lowest().is_none());

        // Inserts and pops can be interleaved.
        let mut interleaved = 0u64;
        interleaved.insert(2u32);
        assert_eq!(interleaved.pop_lowest(), Some(2u32));
        interleaved.insert(1u32);
        assert_eq!(interleaved.pop_lowest(), Some(1u32));
        assert!(interleaved.pop_lowest().is_none());

        // The highest representable element.
        let mut highest = 0u64;
        highest.insert(63u32);
        assert_eq!(highest.pop_lowest(), Some(63u32));
        assert!(highest.pop_lowest().is_none());
    }

    #[test]
    fn test_docbitset() {
        check_against_hashset(&[], 0);
        check_against_hashset(&[], 1);
        check_against_hashset(&[0u32], 1);
        check_against_hashset(&[0u32], 100);
        check_against_hashset(&[1u32, 2u32], 4);
        check_against_hashset(&[99u32], 100);
        check_against_hashset(&[63u32], 64);
        check_against_hashset(&[62u32, 63u32], 64);
    }

    #[test]
    fn test_docbitset_num_buckets() {
        // (max_doc, expected number of 64-bit words)
        let expectations = [
            (0u32, 0usize),
            (1u32, 1usize),
            (64u32, 1usize),
            (65u32, 2usize),
            (128u32, 2usize),
            (129u32, 3usize),
        ];
        for &(max_doc, num_buckets) in expectations.iter() {
            assert_eq!(DocBitSet::with_maxdoc(max_doc).num_tiny_bitsets(), num_buckets);
        }
    }
}

View File

@@ -4,6 +4,7 @@ mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
mod bitset;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::serialize::BinarySerializable;
@@ -12,6 +13,7 @@ pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub use self::vint::VInt;
pub use self::counting_writer::CountingWriter;
pub use self::bitset::{TinySet, DocBitSet};
use std::io;

View File

@@ -92,7 +92,7 @@ pub trait DocSet {
/// Returns a best-effort hint of the
/// length of the docset.
fn size_hint(&self) -> usize;
fn size_hint(&self) -> u32;
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
@@ -111,7 +111,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.doc()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
let unboxed: &TDocSet = self.borrow();
unboxed.size_hint()
}
@@ -133,7 +133,7 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
unref.doc()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
let unref: &TDocSet = *self;
unref.size_hint()
}

View File

@@ -31,7 +31,8 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
}
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
fn size_hint(&self) -> usize {
/// Returns the minimum `.size_hint()` of the intersected docsets.
fn size_hint(&self) -> u32 {
self.docsets
.iter()
.map(|docset| docset.size_hint())

View File

@@ -235,8 +235,8 @@ impl DocSet for SegmentPostings {
}
}
fn size_hint(&self) -> usize {
self.len()
fn size_hint(&self) -> u32 {
self.len() as u32
}
/// Return the current document's `DocId`.

View File

@@ -35,8 +35,8 @@ impl DocSet for VecPostings {
self.doc_ids[self.cursor.0]
}
fn size_hint(&self) -> usize {
self.len()
fn size_hint(&self) -> u32 {
self.len() as u32
}
}

View File

@@ -62,8 +62,8 @@ impl DocSet for AllScorer {
self.doc
}
fn size_hint(&self) -> usize {
self.max_doc as usize
fn size_hint(&self) -> u32 {
self.max_doc
}
}

199
src/query/bitset/mod.rs Normal file
View File

@@ -0,0 +1,199 @@
use common::{DocBitSet, TinySet};
use DocId;
use postings::DocSet;
use postings::SkipResult;
use std::cmp::Ordering;
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
///
/// # Implementation detail
///
/// Skipping is relatively fast here as we can directly point to the
/// right tiny bitset bucket.
///
/// TODO: Consider implementing a `BitTreeSet` in order to advance faster
/// when the bitset is sparse
pub struct BitSetDocSet {
// The bitset being iterated over.
docs: DocBitSet,
// Index associated to the current tiny bitset.
cursor_bucket: usize,
// Working copy of the current tiny bitset: documents are popped from it
// as the cursor advances.
cursor_tinybitset: u64,
// Document set by the last successful `advance`; `0` until then.
doc: u32
}
impl From<DocBitSet> for BitSetDocSet {
    /// Wraps `docs` into a cursor that has not produced any document yet.
    ///
    /// The cursor starts on bucket `0`, loaded with the first tiny bitset
    /// of `docs`, or with an empty one when `docs` has no bucket at all.
    fn from(docs: DocBitSet) -> BitSetDocSet {
        let cursor_tinybitset = match docs.num_tiny_bitsets() {
            0 => 0u64,
            _ => docs.tiny_bitset(0),
        };
        BitSetDocSet {
            docs,
            cursor_bucket: 0,
            cursor_tinybitset,
            doc: 0u32,
        }
    }
}
impl DocSet for BitSetDocSet {
    /// Moves the cursor to the next document of the bitset.
    ///
    /// Returns `false` once every document has been visited.
    fn advance(&mut self) -> bool {
        loop {
            if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
                self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
                return true;
            }
            // The current bucket is exhausted: move on to the next one, if any.
            // Written as `cursor + 1 >= len` because `len - 1` underflows
            // when the bitset has no bucket at all.
            if self.cursor_bucket + 1 >= self.docs.num_tiny_bitsets() {
                return false;
            }
            self.cursor_bucket += 1;
            self.cursor_tinybitset = self.docs.tiny_bitset(self.cursor_bucket);
        }
    }

    /// Advances at least once, then moves to the first document greater
    /// than or equal to `target`.
    ///
    /// Returns `Reached` if the cursor stopped exactly on `target`,
    /// `OverStep` if it stopped on a greater document, and `End` if
    /// there is no such document.
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        // skip is required to advance.
        if !self.advance() {
            return SkipResult::End;
        }
        let target_bucket = (target / 64u32) as usize;
        match target_bucket.cmp(&self.cursor_bucket) {
            Ordering::Greater => {
                // The target lives in a later bucket: jump straight to it.
                let num_buckets = self.docs.num_tiny_bitsets();
                if target_bucket >= num_buckets {
                    // The target is beyond the last bucket: nothing can match.
                    // Park the cursor on an emptied last bucket so that further
                    // calls to `advance` keep returning `false`.
                    // (`num_buckets >= 1` here, as `advance` just succeeded.)
                    self.cursor_bucket = num_buckets - 1;
                    self.cursor_tinybitset = 0u64;
                    return SkipResult::End;
                }
                self.cursor_bucket = target_bucket;
                self.cursor_tinybitset = self.docs.tiny_bitset(target_bucket);
                // Mask for all of the bits greater or equal
                // to our target document within its bucket.
                let greater_or_equal: u64 = !0u64 << (target % 64u32);
                self.cursor_tinybitset.intersect(greater_or_equal);
                if !self.advance() {
                    SkipResult::End
                } else if self.doc() == target {
                    SkipResult::Reached
                } else {
                    SkipResult::OverStep
                }
            }
            Ordering::Equal => {
                // Same bucket: walk document by document.
                loop {
                    match self.doc().cmp(&target) {
                        Ordering::Less => {
                            if !self.advance() {
                                return SkipResult::End;
                            }
                        }
                        Ordering::Equal => {
                            return SkipResult::Reached;
                        }
                        Ordering::Greater => {
                            return SkipResult::OverStep;
                        }
                    }
                }
            }
            // The cursor already sits in a bucket past the target's one,
            // so the current document is greater than the target.
            Ordering::Less => SkipResult::OverStep,
        }
    }

    /// Returns the current document
    fn doc(&self) -> DocId {
        self.doc
    }

    /// Advances the cursor to the next document and returns it.
    /// `None` is returned if the `DocSet`
    /// has already been entirely consumed.
    fn next(&mut self) -> Option<DocId> {
        if self.advance() {
            Some(self.doc())
        } else {
            None
        }
    }

    /// Returns the size hint of the underlying `DocBitSet`.
    fn size_hint(&self) -> u32 {
        self.docs.size_hint()
    }
}
#[cfg(test)]
mod tests {

    use DocId;
    use common::DocBitSet;
    use postings::{SkipResult, DocSet};
    use super::BitSetDocSet;

    /// Builds a `BitSetDocSet` over a bitset of capacity `max_doc`
    /// containing exactly `docs`.
    fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
        let mut docset = DocBitSet::with_maxdoc(max_doc);
        for &doc in docs {
            docset.insert(doc);
        }
        BitSetDocSet::from(docset)
    }

    /// Checks that advancing through the docset yields `docs` in order
    /// and then keeps reporting exhaustion. `docs` must be sorted.
    fn test_go_through_sequential(docs: &[DocId]) {
        let mut docset = create_docbitset(docs, 1_000u32);
        for &doc in docs {
            assert!(docset.advance());
            assert_eq!(doc, docset.doc());
        }
        assert!(!docset.advance());
        assert!(!docset.advance());
    }

    #[test]
    fn test_docbitset_sequential() {
        test_go_through_sequential(&[]);
        test_go_through_sequential(&[1, 2, 3]);
        test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
        test_go_through_sequential(&[63, 64, 65]);
        test_go_through_sequential(&[1, 2, 3, 4, 95, 96, 97, 98, 99]);
    }

    #[test]
    fn test_docbitset_skip() {
        {
            let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
            assert_eq!(docset.skip_next(7), SkipResult::Reached);
            assert_eq!(docset.doc(), 7);
            // `assert!` takes a condition and an optional *message*: the stray
            // `, 7` that used to follow was a panic payload, not a check.
            assert!(docset.advance());
            assert_eq!(docset.doc(), 5112);
            assert!(!docset.advance());
        }
        {
            let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
            assert_eq!(docset.skip_next(3), SkipResult::OverStep);
            assert_eq!(docset.doc(), 5);
            assert!(docset.advance());
        }
        {
            let mut docset = create_docbitset(&[5112], 10_000);
            assert_eq!(docset.skip_next(5112), SkipResult::Reached);
            assert_eq!(docset.doc(), 5112);
            assert!(!docset.advance());
        }
        {
            let mut docset = create_docbitset(&[5112], 10_000);
            assert_eq!(docset.skip_next(5113), SkipResult::End);
            assert!(!docset.advance());
        }
        {
            let mut docset = create_docbitset(&[5112], 10_000);
            assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
            assert_eq!(docset.doc(), 5112);
            assert!(!docset.advance());
        }
    }
}

View File

@@ -8,7 +8,6 @@ use schema::Term;
use query::TermQuery;
use schema::IndexRecordOption;
use query::Occur;
use query::OccurFilter;
/// The boolean query combines a set of queries
///
@@ -39,14 +38,11 @@ impl Query for BooleanQuery {
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
let sub_weights = self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher)?))
})
.collect::<Result<_>>()?;
let occurs: Vec<Occur> = self.subqueries
.iter()
.map(|&(ref occur, ref _subquery)| *occur)
.collect();
let filter = OccurFilter::new(&occurs);
Ok(box BooleanWeight::new(sub_weights, filter))
Ok(box BooleanWeight::new(sub_weights))
}
}

View File

@@ -90,7 +90,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
}
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
// TODO fix this. it should be the min
// of the MUST scorer
// and the max of the SHOULD scorers.

View File

@@ -1,31 +1,49 @@
use query::Weight;
use core::SegmentReader;
use query::EmptyScorer;
use query::Scorer;
use super::BooleanScorer;
use query::OccurFilter;
use query::Occur;
use Result;
pub struct BooleanWeight {
weights: Vec<Box<Weight>>,
occur_filter: OccurFilter,
weights: Vec<(Occur, Box<Weight>)>,
}
impl BooleanWeight {
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
pub fn new(weights: Vec<(Occur, Box<Weight>)>) -> BooleanWeight {
BooleanWeight {
weights,
occur_filter,
weights
}
}
}
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect::<Result<_>>()?;
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}
if self.weights.is_empty() {
Ok(box EmptyScorer)
} else if self.weights.len() == 1 {
let &(occur, ref weight) = &self.weights[0];
if occur == Occur::MustNot {
Ok(box EmptyScorer)
} else {
weight.scorer(reader)
}
} else {
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
.iter()
.map(|&(_, ref weight)| weight)
.map(|weight| weight.scorer(reader))
.collect::<Result<_>>()?;
let occurs: Vec<Occur> = self.weights
.iter()
.map(|&(ref occur, _)| *occur)
.collect();
let occur_filter = OccurFilter::new(&occurs);
let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter);
Ok(box boolean_scorer)
}
}
}

View File

@@ -12,7 +12,9 @@ mod term_query;
mod query_parser;
mod phrase_query;
mod all_query;
mod bitset;
pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::occur_filter::OccurFilter;
pub use self::occur::Occur;
@@ -24,4 +26,4 @@ pub use self::scorer::EmptyScorer;
pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;
pub use self::all_query::{AllQuery, AllWeight, AllScorer};
pub use self::all_query::{AllQuery, AllWeight, AllScorer};

View File

@@ -35,7 +35,7 @@ impl DocSet for PostingsWithOffset {
self.segment_postings.doc()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
self.segment_postings.size_hint()
}
@@ -125,7 +125,7 @@ impl DocSet for PhraseScorer {
self.intersection_docset.doc()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
self.intersection_docset.size_hint()
}
}

View File

@@ -49,7 +49,7 @@ impl DocSet for EmptyScorer {
DocId::max_value()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
0
}
}

View File

@@ -36,7 +36,7 @@ where
self.postings.doc()
}
fn size_hint(&self) -> usize {
fn size_hint(&self) -> u32 {
self.postings.size_hint()
}
}