mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
253 lines
7.9 KiB
Rust
253 lines
7.9 KiB
Rust
use crate::common::{BitSet, TinySet};
|
|
use crate::docset::{DocSet, TERMINATED};
|
|
use crate::DocId;
|
|
|
|
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
|
|
///
|
|
/// # Implementation detail
|
|
///
|
|
/// Skipping is relatively fast here as we can directly point to the
|
|
/// right tiny bitset bucket.
|
|
///
|
|
/// TODO: Consider implementing a `BitTreeSet` in order to advance faster
|
|
/// when the bitset is sparse
|
|
pub struct BitSetDocSet {
|
|
docs: BitSet,
|
|
cursor_bucket: u32, //< index associated to the current tiny bitset
|
|
cursor_tinybitset: TinySet,
|
|
doc: u32,
|
|
}
|
|
|
|
impl BitSetDocSet {
|
|
fn go_to_bucket(&mut self, bucket_addr: u32) {
|
|
self.cursor_bucket = bucket_addr;
|
|
self.cursor_tinybitset = self.docs.tinyset(bucket_addr);
|
|
}
|
|
}
|
|
|
|
impl From<BitSet> for BitSetDocSet {
|
|
fn from(docs: BitSet) -> BitSetDocSet {
|
|
let first_tiny_bitset = if docs.max_value() == 0 {
|
|
TinySet::empty()
|
|
} else {
|
|
docs.tinyset(0)
|
|
};
|
|
let mut docset = BitSetDocSet {
|
|
docs,
|
|
cursor_bucket: 0,
|
|
cursor_tinybitset: first_tiny_bitset,
|
|
doc: 0u32,
|
|
};
|
|
docset.advance();
|
|
docset
|
|
}
|
|
}
|
|
|
|
impl DocSet for BitSetDocSet {
|
|
fn advance(&mut self) -> DocId {
|
|
if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
|
|
self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
|
|
return self.doc;
|
|
}
|
|
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
|
|
self.go_to_bucket(cursor_bucket);
|
|
let lower = self.cursor_tinybitset.pop_lowest().unwrap();
|
|
self.doc = (cursor_bucket * 64u32) | lower;
|
|
self.doc
|
|
} else {
|
|
self.doc = TERMINATED;
|
|
TERMINATED
|
|
}
|
|
}
|
|
|
|
fn seek(&mut self, target: DocId) -> DocId {
|
|
if target >= self.docs.max_value() {
|
|
self.doc = TERMINATED;
|
|
return TERMINATED;
|
|
}
|
|
let target_bucket = target / 64u32;
|
|
if target_bucket > self.cursor_bucket {
|
|
self.go_to_bucket(target_bucket);
|
|
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
|
|
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
|
|
self.advance()
|
|
} else {
|
|
let mut doc = self.doc();
|
|
while doc < target {
|
|
doc = self.advance();
|
|
}
|
|
doc
|
|
}
|
|
}
|
|
|
|
/// Returns the current document
|
|
fn doc(&self) -> DocId {
|
|
self.doc
|
|
}
|
|
|
|
/// Returns half of the `max_doc`
|
|
/// This is quite a terrible heuristic,
|
|
/// but we don't have access to any better
|
|
/// value.
|
|
fn size_hint(&self) -> u32 {
|
|
self.docs.len() as u32
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::BitSetDocSet;
|
|
use crate::common::BitSet;
|
|
use crate::docset::{DocSet, TERMINATED};
|
|
use crate::DocId;
|
|
|
|
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
|
|
let mut docset = BitSet::with_max_value(max_doc);
|
|
for &doc in docs {
|
|
docset.insert(doc);
|
|
}
|
|
BitSetDocSet::from(docset)
|
|
}
|
|
|
|
#[test]
|
|
fn test_empty() {
|
|
let bitset = BitSet::with_max_value(1000);
|
|
let mut empty = BitSetDocSet::from(bitset);
|
|
assert_eq!(empty.advance(), TERMINATED)
|
|
}
|
|
|
|
#[test]
|
|
fn test_seek_terminated() {
|
|
let bitset = BitSet::with_max_value(1000);
|
|
let mut empty = BitSetDocSet::from(bitset);
|
|
assert_eq!(empty.seek(TERMINATED), TERMINATED)
|
|
}
|
|
|
|
fn test_go_through_sequential(docs: &[DocId]) {
|
|
let mut docset = create_docbitset(docs, 1_000u32);
|
|
for &doc in docs {
|
|
assert_eq!(doc, docset.doc());
|
|
docset.advance();
|
|
}
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
|
|
#[test]
|
|
fn test_docbitset_sequential() {
|
|
test_go_through_sequential(&[1, 2, 3]);
|
|
test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
|
|
test_go_through_sequential(&[63, 64, 65]);
|
|
test_go_through_sequential(&[1, 2, 3, 4, 95, 96, 97, 98, 99]);
|
|
}
|
|
|
|
#[test]
|
|
fn test_docbitset_skip() {
|
|
{
|
|
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
|
assert_eq!(docset.seek(7), 7);
|
|
assert_eq!(docset.doc(), 7);
|
|
assert_eq!(docset.advance(), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
|
assert_eq!(docset.seek(3), 5);
|
|
assert_eq!(docset.doc(), 5);
|
|
assert_eq!(docset.advance(), 6);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[5112], 10_000);
|
|
assert_eq!(docset.seek(5112), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[5112], 10_000);
|
|
assert_eq!(docset.seek(5113), TERMINATED);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[5112], 10_000);
|
|
assert_eq!(docset.seek(5111), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
|
assert_eq!(docset.seek(5112), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), 5500);
|
|
assert_eq!(docset.doc(), 5500);
|
|
assert_eq!(docset.advance(), 6666);
|
|
assert_eq!(docset.doc(), 6666);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
|
assert_eq!(docset.seek(5111), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), 5500);
|
|
assert_eq!(docset.doc(), 5500);
|
|
assert_eq!(docset.advance(), 6666);
|
|
assert_eq!(docset.doc(), 6666);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
{
|
|
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000);
|
|
assert_eq!(docset.seek(5111), 5112);
|
|
assert_eq!(docset.doc(), 5112);
|
|
assert_eq!(docset.advance(), 5513);
|
|
assert_eq!(docset.doc(), 5513);
|
|
assert_eq!(docset.advance(), 6666);
|
|
assert_eq!(docset.doc(), 6666);
|
|
assert_eq!(docset.advance(), TERMINATED);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(all(test, feature = "unstable"))]
|
|
mod bench {
|
|
|
|
use super::BitSet;
|
|
use super::BitSetDocSet;
|
|
use crate::docset::TERMINATED;
|
|
use crate::test;
|
|
use crate::tests;
|
|
use crate::DocSet;
|
|
|
|
#[bench]
|
|
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
|
|
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
|
|
b.iter(|| {
|
|
let mut bitset = BitSet::with_max_value(1_000_000);
|
|
for el in els.iter().cloned() {
|
|
bitset.insert(el);
|
|
}
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
|
|
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
|
|
let mut bitset = BitSet::with_max_value(1_000_000);
|
|
for el in els {
|
|
bitset.insert(el);
|
|
}
|
|
b.iter(|| bitset.clone());
|
|
}
|
|
|
|
#[bench]
|
|
fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
|
|
let els = tests::sample(1_000_000u32, 0.01);
|
|
let mut bitset = BitSet::with_max_value(1_000_000);
|
|
for el in els {
|
|
bitset.insert(el);
|
|
}
|
|
b.iter(|| {
|
|
let mut docset = BitSetDocSet::from(bitset.clone());
|
|
while docset.advance() != TERMINATED {}
|
|
});
|
|
}
|
|
}
|