From 4da71273e1408ba844f8ce6868ec7ab366086afb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 17 Sep 2021 10:28:12 +0800 Subject: [PATCH] add de/serialization for bitset remove len footgun --- common/src/bitset.rs | 61 +++++++++++++++++++++++++++++++++++++++-- src/fastfield/delete.rs | 12 ++------ src/indexer/merger.rs | 3 +- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 942a94269..c5f741829 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -1,5 +1,7 @@ -use std::fmt; +use std::convert::TryInto; +use std::io::Write; use std::u64; +use std::{fmt, io}; #[derive(Clone, Copy, Eq, PartialEq)] pub struct TinySet(u64); @@ -28,6 +30,15 @@ impl IntoIterator for TinySet { } impl TinySet { + pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { + writer.write_all(self.0.to_le_bytes().as_ref()) + } + + pub fn deserialize(data: &[u8]) -> io::Result { + let val: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + Ok(TinySet(val)) + } + /// Returns an empty `TinySet`. pub fn empty() -> TinySet { TinySet(0u64) @@ -123,7 +134,7 @@ impl TinySet { #[derive(Clone)] pub struct BitSet { tinysets: Box<[TinySet]>, - len: usize, + len: u64, max_value: u32, } @@ -132,6 +143,41 @@ fn num_buckets(max_val: u32) -> u32 { } impl BitSet { + /// Write a `BitSet` + /// + pub fn serialize(&mut self, writer: &mut dyn Write) -> io::Result<()> { + writer.write_all(self.len.to_le_bytes().as_ref())?; + writer.write_all(self.max_value.to_le_bytes().as_ref())?; + + for tinyset in self.tinysets.iter() { + tinyset.serialize(writer)?; + } + writer.flush()?; + Ok(()) + } + + /// UnWrite a `BitSet` + /// + pub fn deserialize(&mut self, mut data: &[u8]) -> io::Result { + let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + data = &data[8..]; + + let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + data = &data[4..]; + + let mut tinysets = vec![]; + while !data.is_empty() { + let tinyset = TinySet::deserialize(data)?; + tinysets.push(tinyset); + data = &data[8..]; + } + Ok(BitSet { + tinysets: tinysets.into_boxed_slice(), + len, + max_value, + }) + } + /// Create a new `BitSet` that may contain elements /// within `[0, max_val[`. pub fn with_max_value(max_value: u32) -> BitSet { @@ -153,7 +199,7 @@ impl BitSet { /// Returns the number of elements in the `BitSet`. pub fn len(&self) -> usize { - self.len + self.len as usize } /// Inserts an element in the `BitSet` @@ -249,6 +295,15 @@ mod tests { assert_eq!(hashset.contains(&el), bitset.contains(el)); } assert_eq!(bitset.max_value(), max_value); + + // test deser + let mut data = vec![]; + bitset.serialize(&mut data).unwrap(); + let bitset = bitset.deserialize(&data).unwrap(); + for el in 0..max_value { + assert_eq!(hashset.contains(&el), bitset.contains(el)); + } + assert_eq!(bitset.max_value(), max_value); }; test_against_hashset(&[], 0); diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index f695f9968..eff577d2b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -3,7 +3,6 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; -use common::HasLen; use std::io; use std::io::Write; @@ -118,17 +117,10 @@ impl DeleteBitSet { } } -impl HasLen for DeleteBitSet { - fn len(&self) -> usize { - self.num_deleted - } -} - #[cfg(test)] mod tests { use super::DeleteBitSet; - use common::HasLen; #[test] fn test_delete_bitset_empty() { @@ -136,7 +128,7 @@ mod tests { for doc in 0..10 { assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.len(), 0); + assert_eq!(delete_bitset.num_deleted(), 0); } #[test] @@ -156,7 +148,7 @@ mod tests { for doc in 0..10 { assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.len(), 2); + assert_eq!(delete_bitset.num_deleted(), 2); } #[test] diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index d603f2900..06d02859a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -29,7 +29,6 @@ use crate::{ SegmentOrdinal, }; use crate::{DocId, InvertedIndexReader, SegmentComponent}; -use common::HasLen; use itertools::Itertools; use measure_time::debug_time; use std::cmp; @@ -502,7 +501,7 @@ impl IndexMerger { let mut num_docs = 0; for (reader, u64s_reader) in reader_and_field_accessors.iter() { if let Some(delete_bitset) = reader.delete_bitset() { - num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64; + num_docs += reader.max_doc() as u64 - delete_bitset.num_deleted() as u64; for doc in reader.doc_ids_alive() { let num_vals = u64s_reader.get_len(doc) as u64; total_num_vals += num_vals;