add de/serialization for bitset

remove len footgun
This commit is contained in:
Pascal Seitz
2021-09-17 10:28:12 +08:00
parent 4ae1d87632
commit 4da71273e1
3 changed files with 61 additions and 15 deletions

View File

@@ -1,5 +1,7 @@
use std::fmt;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
use std::{fmt, io};
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64);
@@ -28,6 +30,15 @@ impl IntoIterator for TinySet {
}
impl TinySet {
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
writer.write_all(self.0.to_le_bytes().as_ref())
}
pub fn deserialize(data: &[u8]) -> io::Result<Self> {
let val: u64 = u64::from_le_bytes(data[..8].try_into().unwrap());
Ok(TinySet(val))
}
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
@@ -123,7 +134,7 @@ impl TinySet {
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize,
len: u64,
max_value: u32,
}
@@ -132,6 +143,41 @@ fn num_buckets(max_val: u32) -> u32 {
}
impl BitSet {
/// Write a `BitSet`
///
pub fn serialize(&mut self, writer: &mut dyn Write) -> io::Result<()> {
writer.write_all(self.len.to_le_bytes().as_ref())?;
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter() {
tinyset.serialize(writer)?;
}
writer.flush()?;
Ok(())
}
/// UnWrite a `BitSet`
///
pub fn deserialize(&mut self, mut data: &[u8]) -> io::Result<Self> {
let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap());
data = &data[8..];
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
data = &data[4..];
let mut tinysets = vec![];
while !data.is_empty() {
let tinyset = TinySet::deserialize(data)?;
tinysets.push(tinyset);
data = &data[8..];
}
Ok(BitSet {
tinysets: tinysets.into_boxed_slice(),
len,
max_value,
})
}
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
@@ -153,7 +199,7 @@ impl BitSet {
/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
self.len as usize
}
/// Inserts an element in the `BitSet`
@@ -249,6 +295,15 @@ mod tests {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
// test deser
let mut data = vec![];
bitset.serialize(&mut data).unwrap();
let bitset = bitset.deserialize(&data).unwrap();
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
};
test_against_hashset(&[], 0);

View File

@@ -3,7 +3,6 @@ use crate::directory::OwnedBytes;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::HasLen;
use std::io;
use std::io::Write;
@@ -118,17 +117,10 @@ impl DeleteBitSet {
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.num_deleted
}
}
#[cfg(test)]
mod tests {
use super::DeleteBitSet;
use common::HasLen;
#[test]
fn test_delete_bitset_empty() {
@@ -136,7 +128,7 @@ mod tests {
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 0);
assert_eq!(delete_bitset.num_deleted(), 0);
}
#[test]
@@ -156,7 +148,7 @@ mod tests {
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 2);
assert_eq!(delete_bitset.num_deleted(), 2);
}
#[test]

View File

@@ -29,7 +29,6 @@ use crate::{
SegmentOrdinal,
};
use crate::{DocId, InvertedIndexReader, SegmentComponent};
use common::HasLen;
use itertools::Itertools;
use measure_time::debug_time;
use std::cmp;
@@ -502,7 +501,7 @@ impl IndexMerger {
let mut num_docs = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(delete_bitset) = reader.delete_bitset() {
num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64;
num_docs += reader.max_doc() as u64 - delete_bitset.num_deleted() as u64;
for doc in reader.doc_ids_alive() {
let num_vals = u64s_reader.get_len(doc) as u64;
total_num_vals += num_vals;