create and use ReadSerializedBitSet

This commit is contained in:
Pascal Seitz
2021-09-24 12:53:33 +08:00
parent c217bfed1e
commit 5ee5037934
9 changed files with 195 additions and 123 deletions

View File

@@ -10,6 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.1", path="../ownedbytes" }
[dev-dependencies]
proptest = "1.0.0"

View File

@@ -1,3 +1,4 @@
use ownedbytes::OwnedBytes;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
@@ -64,34 +65,28 @@ impl TinySet {
}
#[inline]
/// Returns true iff the `TinySet` bit is set at position `pos`.
pub fn contains(self, pos: u32) -> bool {
!self.intersect(TinySet::singleton(pos)).is_empty()
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
#[inline]
/// Returns the number of set bits in the TinySet.
pub fn num_set(self) -> u32 {
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
#[inline]
/// Returns the number of unset bits in the TinySet.
pub fn num_unset(self) -> u32 {
self.0.count_zeros()
}
#[inline]
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}
/// Creates a new `TinySet` with only one bit set at `pos`.
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline]
pub fn singleton(pos: u32) -> TinySet {
TinySet(1u64 << u64::from(pos))
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64)
@@ -108,7 +103,7 @@ impl TinySet {
/// Insert a new element within [0..64)
///
/// returns true if the bit changed
/// returns true if the set changed
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
@@ -116,9 +111,9 @@ impl TinySet {
old != *self
}
/// Remove a new element within [0..64)
/// Remove an element within [0..64)
///
/// returns true if the bit changed
/// returns true if the set changed
#[inline]
pub fn remove_mut(&mut self, el: u32) -> bool {
let old = *self;
@@ -203,7 +198,7 @@ impl BitSet {
let mut tinysets = vec![];
for chunk in data.chunks_exact(8) {
let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?;
len += tinyset.num_set() as u64;
len += tinyset.len() as u64;
tinysets.push(tinyset);
}
Ok(BitSet {
@@ -213,43 +208,6 @@ impl BitSet {
})
}
/// Count the number of unset bits from serialized data.
///
#[inline]
pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize {
BitSet::iter_tinysets_from_bytes(data)
.map(|tinyset| tinyset.num_unset() as usize)
.sum()
}
/// Iterate the tinyset on the fly from serialized data.
///
#[inline]
fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
assert!((data.len() - 4) % 8 == 0);
data[4..].chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
tinyset
})
}
/// Iterate over the positions of the unset elements.
///
/// max_val needs to be provided, since the last 64 bits may
#[inline]
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
Self::iter_tinysets_from_bytes(data)
.enumerate()
.flat_map(move |(chunk_num, tinyset)| {
let chunk_base_val = chunk_num as u32 * 64;
tinyset
.into_iter()
.map(move |val| val + chunk_base_val)
.take_while(move |doc| *doc < max_val)
})
}
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val)`.
pub fn with_max_value(max_value: u32) -> BitSet {
@@ -262,9 +220,9 @@ impl BitSet {
}
}
/// Create a new `BitSet` that may contain elements
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val)`. Initially all values will be set.
pub fn with_max_value_and_filled(max_value: u32) -> BitSet {
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
BitSet {
@@ -282,11 +240,12 @@ impl BitSet {
}
/// Returns the number of elements in the `BitSet`.
pub fn num_set_bits(&self) -> usize {
pub fn len(&self) -> usize {
self.len as usize
}
/// Inserts an element in the `BitSet`
#[inline]
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
@@ -299,6 +258,7 @@ impl BitSet {
}
/// Removes an element from the `BitSet`
#[inline]
pub fn remove(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
@@ -312,14 +272,6 @@ impl BitSet {
/// Returns true iff the element is in the `BitSet`.
#[inline]
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
let byte_offset = 4 + el / 8u32;
let b: u8 = data[byte_offset as usize];
let shift = (el % 8) as u8;
b & (1u8 << shift) != 0
}
/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
@@ -349,17 +301,133 @@ impl BitSet {
}
}
/// Read-only view over a serialized `BitSet`.
///
/// Queries are answered directly from the serialized bytes; no in-memory
/// `BitSet` is built.
#[derive(Clone)]
pub struct ReadSerializedBitSet {
// Serialized buckets: the bytes remaining after the 4-byte header is split off.
data: OwnedBytes,
// Value decoded from the 4-byte little-endian header.
max_value: u32,
}
impl ReadSerializedBitSet {
    /// Wraps a serialized `BitSet` without deserializing its buckets.
    ///
    /// The first 4 bytes of `data` are decoded as the little-endian
    /// `max_value`; the remaining bytes are kept as-is and read lazily as
    /// 8-byte buckets.
    ///
    /// # Panics
    ///
    /// Panics if the 4-byte header cannot be read from `data`.
    pub fn new(data: OwnedBytes) -> Self {
        let (max_value_data, data) = data.split(4);
        let max_value: u32 = u32::from_le_bytes(
            max_value_data
                .as_ref()
                .try_into()
                .expect("serialized bitset must start with a 4-byte max_value header"),
        );
        ReadSerializedBitSet { data, max_value }
    }

    /// Counts the elements of `[0, max_value)` that are NOT in the set.
    ///
    /// Bits stored at positions `>= max_value` (padding of the last bucket)
    /// are ignored, whatever their value.
    #[inline]
    pub fn count_unset(&self) -> usize {
        let max_value = u64::from(self.max_value);
        let num_set: usize = self
            .iter_tinysets()
            .enumerate()
            .map(move |(chunk_num, (tinyset, _is_last))| {
                // Number of positions of this bucket that lie below `max_value`.
                // Deriving it from the bucket index (instead of masking the last
                // bucket with `max_value % 64`) keeps a completely used last
                // bucket intact when `max_value` is a multiple of 64.
                let num_valid = max_value.saturating_sub(chunk_num as u64 * 64);
                if num_valid >= 64 {
                    tinyset.len() as usize
                } else if num_valid == 0 {
                    0
                } else {
                    tinyset
                        .intersect(TinySet::range_lower(num_valid as u32))
                        .len() as usize
                }
            })
            .sum();
        self.max_value as usize - num_set
    }

    /// Decodes the buckets on the fly from the serialized data.
    ///
    /// Yields `(TinySet, is_last)` pairs; `is_last` is true only for the final
    /// bucket, whose upper bits may lie at or beyond `max_value`.
    ///
    /// # Panics
    ///
    /// Panics if the payload length is not a multiple of 8 bytes.
    #[inline]
    fn iter_tinysets(&self) -> impl Iterator<Item = (TinySet, bool)> + '_ {
        assert!(self.data.len() % 8 == 0);
        let num_chunks = self.data.len() / 8;
        self.data
            .chunks_exact(8)
            .enumerate()
            .map(move |(chunk_num, chunk)| {
                let is_last = chunk_num + 1 == num_chunks;
                let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
                (tinyset, is_last)
            })
    }

    /// Iterates over the elements that are present in the set, restricted to
    /// values below `max_value`.
    ///
    /// NOTE: despite its name, this yields the *set* elements; for a bitset of
    /// alive documents these are the alive doc ids.
    #[inline]
    pub fn iter_unset(&self) -> impl Iterator<Item = u32> + '_ {
        self.iter_tinysets()
            .enumerate()
            .flat_map(move |(chunk_num, (tinyset, _))| {
                let chunk_base_val = chunk_num as u32 * 64;
                tinyset
                    .into_iter()
                    .map(move |val| val + chunk_base_val)
                    // Drop padding bits of the last bucket.
                    .take_while(move |doc| *doc < self.max_value)
            })
    }

    /// Returns true iff the bit of `el` is set in the serialized data.
    ///
    /// The bit is looked up at byte `el / 8`, bit `el % 8` of the payload.
    ///
    /// NOTE(review): `el` is not compared against `max_value`, so a padding
    /// bit in the last bucket is reported as stored; an index past the
    /// payload is expected to panic on the slice access.
    #[inline]
    pub fn contains(&self, el: u32) -> bool {
        let byte_offset = el / 8u32;
        let b: u8 = self.data[byte_offset as usize];
        let shift = (el % 8) as u8;
        b & (1u8 << shift) != 0
    }

    /// Returns the max_value.
    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
    }
}
#[cfg(test)]
mod tests {
use super::BitSet;
use super::ReadSerializedBitSet;
use super::TinySet;
use ownedbytes::OwnedBytes;
use rand::distributions::Bernoulli;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::collections::HashSet;
use std::convert::TryInto;
#[test]
fn test_read_serialized_bitset_full() {
    // Start from a fully populated set over [0, 5) and clear a single element.
    let mut full_bitset = BitSet::with_max_value_and_full(5);
    full_bitset.remove(3);

    let mut serialized = vec![];
    full_bitset.serialize(&mut serialized).unwrap();

    // Reading the serialized form back must report exactly that one hole.
    let reader = ReadSerializedBitSet::new(OwnedBytes::new(serialized));
    assert_eq!(reader.count_unset(), 1);
}
#[test]
fn test_read_serialized_bitset_empty() {
    // One element out of five is set: the other four count as unset.
    let mut sparse_bitset = BitSet::with_max_value(5);
    sparse_bitset.insert(3);
    let mut sparse_bytes = vec![];
    sparse_bitset.serialize(&mut sparse_bytes).unwrap();
    let sparse_reader = ReadSerializedBitSet::new(OwnedBytes::new(sparse_bytes));
    assert_eq!(sparse_reader.count_unset(), 4);

    // Nothing is set: all five elements count as unset.
    let empty_bitset = BitSet::with_max_value(5);
    let mut empty_bytes = vec![];
    empty_bitset.serialize(&mut empty_bytes).unwrap();
    let empty_reader = ReadSerializedBitSet::new(OwnedBytes::new(empty_bytes));
    assert_eq!(empty_reader.count_unset(), 5);
}
#[test]
fn test_tiny_set_remove() {
{
@@ -452,7 +520,7 @@ mod tests {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
assert_eq!(bitset.num_set_bits(), els.len());
assert_eq!(bitset.len(), els.len());
};
test_against_hashset(&[], 0);
@@ -506,25 +574,25 @@ mod tests {
#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.num_set_bits(), 0);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.num_set_bits(), 1);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.num_set_bits(), 2);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.num_set_bits(), 2);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.num_set_bits(), 2);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.num_set_bits(), 3);
assert_eq!(bitset.len(), 3);
bitset.remove(105u32);
assert_eq!(bitset.num_set_bits(), 3);
assert_eq!(bitset.len(), 3);
bitset.remove(104u32);
assert_eq!(bitset.num_set_bits(), 2);
assert_eq!(bitset.len(), 2);
bitset.remove(3u32);
assert_eq!(bitset.num_set_bits(), 1);
assert_eq!(bitset.len(), 1);
bitset.remove(103u32);
assert_eq!(bitset.num_set_bits(), 0);
assert_eq!(bitset.len(), 0);
}
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {

View File

@@ -73,7 +73,7 @@ impl SegmentReader {
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.alive_bitset()
.map(|delete_set| delete_set.num_deleted() as DocId)
.map(|alive_set| alive_set.num_deleted() as DocId)
.unwrap_or(0u32)
}
@@ -289,7 +289,7 @@ impl SegmentReader {
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
if let Some(alive_bitset) = &self.alive_bitset_opt {
Box::new(alive_bitset.iter_unset())
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc)
}

View File

@@ -89,7 +89,7 @@ pub trait DocSet: Send {
let mut count = 0u32;
let mut doc = self.doc();
while doc != TERMINATED {
if !alive_bitset.is_deleted(doc) {
if alive_bitset.is_alive(doc) {
count += 1u32;
}
doc = self.advance();

View File

@@ -3,6 +3,7 @@ use crate::directory::OwnedBytes;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::ReadSerializedBitSet;
use std::io;
use std::io::Write;
@@ -21,16 +22,17 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
pub struct AliveBitSet {
data: OwnedBytes,
num_deleted: usize,
bitset: ReadSerializedBitSet,
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet {
pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
use std::path::Path;
assert!(not_alive_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_filled(max_doc);
for &doc in not_alive_docs {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let directory = RamDirectory::create();
@@ -45,32 +47,38 @@ impl AliveBitSet {
/// Opens an alive bitset given its file.
pub fn open(file: FileSlice) -> crate::Result<AliveBitSet> {
let bytes = file.read_bytes()?;
let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice());
let bitset = ReadSerializedBitSet::new(bytes.clone());
let num_deleted = bitset.count_unset();
Ok(AliveBitSet {
data: bytes,
num_deleted,
bitset,
})
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
#[inline]
pub fn is_alive(&self, doc: DocId) -> bool {
!self.is_deleted(doc)
self.bitset.contains(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
let data = self.data.as_slice();
!BitSet::contains_from_bytes(doc, data)
!self.is_alive(doc)
}
/// Iterate over the positions of the set elements
/// Iterate over the alive docids.
#[inline]
pub fn iter_unset(&self) -> impl Iterator<Item = u32> + '_ {
let data = self.data.as_slice();
BitSet::iter_unset_from_bytes(data)
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter_unset()
}
/// Get underlying bitset
#[inline]
pub fn bitset(&self) -> &ReadSerializedBitSet {
&self.bitset
}
/// The number of deleted docs
@@ -121,7 +129,7 @@ mod tests {
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test(&[7], 8);
let data: Vec<_> = alive_bitset.iter_unset().collect();
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
@@ -129,14 +137,14 @@ mod tests {
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_unset().collect();
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_unset().collect();
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
@@ -166,7 +174,7 @@ mod bench {
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_unset().collect::<Vec<_>>());
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
@@ -184,7 +192,7 @@ mod bench {
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_unset().collect::<Vec<_>>());
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]

View File

@@ -151,7 +151,7 @@ pub(crate) fn advance_deletes(
let max_doc = segment_reader.max_doc();
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
Some(previous_alive_bitset) => (*previous_alive_bitset).clone(),
None => BitSet::with_max_value_and_filled(max_doc),
None => BitSet::with_max_value_and_full(max_doc),
};
let num_deleted_docs_before = segment.meta().num_deleted_docs();
@@ -175,7 +175,7 @@ pub(crate) fn advance_deletes(
}
}
let num_alive_docs: u32 = alive_bitset.num_set_bits() as u32;
let num_alive_docs: u32 = alive_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
if num_deleted_docs > num_deleted_docs_before {
// There are new deletes. We need to write a new delete file.
@@ -259,7 +259,7 @@ fn apply_deletes(
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc);
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,

View File

@@ -99,22 +99,21 @@ fn compute_min_max_val(
segment_reader: &SegmentReader,
) -> Option<(u64, u64)> {
if segment_reader.max_doc() == 0 {
None
} else {
if segment_reader.alive_bitset().is_some() {
// some deleted documents,
// we need to recompute the max / min
minmax(
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get(doc_id)),
)
} else {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u64_reader.min_value(), u64_reader.max_value()))
}
return None;
}
if segment_reader.alive_bitset().is_none() {
// no deleted documents,
// we can use the previous min_val, max_val.
return Some((u64_reader.min_value(), u64_reader.max_value()));
}
// some deleted documents,
// we need to recompute the max / min
minmax(
segment_reader
.doc_ids_alive()
.map(|doc_id| u64_reader.get(doc_id)),
)
}
struct TermOrdinalMapping {

View File

@@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet {
/// but we don't have access to any better
/// value.
fn size_hint(&self) -> u32 {
self.docs.num_set_bits() as u32
self.docs.len() as u32
}
}
@@ -124,7 +124,7 @@ mod tests {
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.num_set_bits());
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {

View File

@@ -219,18 +219,14 @@ where
}
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
.iter()
.map(|bitset| bitset.num_set())
.map(|bitset| bitset.len())
.sum::<u32>()
+ 1;
for bitset in self.bitsets.iter_mut() {
bitset.clear();
}
while self.refill() {
count += self
.bitsets
.iter()
.map(|bitset| bitset.num_set())
.sum::<u32>();
count += self.bitsets.iter().map(|bitset| bitset.len()).sum::<u32>();
for bitset in self.bitsets.iter_mut() {
bitset.clear();
}