mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
AliveBitSet instead of DeleteBitSet
This commit is contained in:
@@ -31,7 +31,7 @@ impl IntoIterator for TinySet {
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.0.to_le_bytes().as_ref())
|
||||
}
|
||||
|
||||
@@ -42,17 +42,24 @@ impl TinySet {
|
||||
}
|
||||
|
||||
/// Returns an empty `TinySet`.
|
||||
#[inline]
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns a full `TinySet`.
|
||||
#[inline]
|
||||
pub fn full() -> TinySet {
|
||||
TinySet::empty().complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
pub fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
@@ -68,6 +75,12 @@ impl TinySet {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn num_unset(self) -> u32 {
|
||||
self.0.count_zeros()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
@@ -81,13 +94,21 @@ impl TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
/// Insert a new element within [0..64)
|
||||
#[inline]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
/// Removes an element within [0..64)
|
||||
#[inline]
|
||||
pub fn remove(self, el: u32) -> TinySet {
|
||||
self.intersect(TinySet::singleton(el).complement())
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
@@ -95,6 +116,16 @@ impl TinySet {
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Remove a new element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
#[inline]
|
||||
pub fn remove_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.remove(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
@@ -151,7 +182,7 @@ fn num_buckets(max_val: u32) -> u32 {
|
||||
impl BitSet {
|
||||
/// serialize a `BitSet`.
|
||||
///
|
||||
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
|
||||
for tinyset in self.tinysets.iter() {
|
||||
@@ -163,6 +194,7 @@ impl BitSet {
|
||||
|
||||
/// Deserialize a `BitSet`.
|
||||
///
|
||||
#[cfg(test)]
|
||||
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
|
||||
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
data = &data[4..];
|
||||
@@ -181,10 +213,19 @@ impl BitSet {
|
||||
})
|
||||
}
|
||||
|
||||
/// Count the number of unset bits from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize {
|
||||
BitSet::iter_tinysets_from_bytes(data)
|
||||
.map(|tinyset| tinyset.num_unset() as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
assert!((data.len() - 4) % 8 == 0);
|
||||
data[4..].chunks_exact(8).map(move |chunk| {
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
@@ -198,8 +239,7 @@ impl BitSet {
|
||||
#[inline]
|
||||
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
|
||||
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
Self::iter_from_bytes(data)
|
||||
.map(|tinyset| tinyset.complement())
|
||||
Self::iter_tinysets_from_bytes(data)
|
||||
.enumerate()
|
||||
.flat_map(move |(chunk_num, tinyset)| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
@@ -211,7 +251,7 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val[`.
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
@@ -222,6 +262,18 @@ impl BitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value_and_filled(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybisets,
|
||||
len: max_value as u64,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
@@ -230,7 +282,7 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn len(&self) -> usize {
|
||||
pub fn num_set_bits(&self) -> usize {
|
||||
self.len as usize
|
||||
}
|
||||
|
||||
@@ -246,6 +298,18 @@ impl BitSet {
|
||||
};
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
pub fn remove(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
|
||||
@@ -296,6 +360,33 @@ mod tests {
|
||||
use std::collections::HashSet;
|
||||
use std::convert::TryInto;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set_remove() {
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty()
|
||||
.insert(63u32)
|
||||
.insert(1)
|
||||
.insert(5)
|
||||
.remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(1u32);
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
@@ -361,7 +452,7 @@ mod tests {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
assert_eq!(bitset.len(), els.len());
|
||||
assert_eq!(bitset.num_set_bits(), els.len());
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
@@ -415,17 +506,25 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
bitset.remove(105u32);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
bitset.remove(104u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.remove(3u32);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
bitset.remove(103u32);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||
|
||||
Reference in New Issue
Block a user