mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
move code to bitset, add inline, add benchmark
This commit is contained in:
@@ -16,6 +16,7 @@ pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
@@ -34,6 +35,7 @@ impl TinySet {
|
||||
writer.write_all(self.0.to_le_bytes().as_ref())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn deserialize(data: [u8; 8]) -> io::Result<Self> {
|
||||
let val: u64 = u64::from_le_bytes(data);
|
||||
Ok(TinySet(val))
|
||||
@@ -48,21 +50,25 @@ impl TinySet {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
@@ -146,8 +152,7 @@ impl BitSet {
|
||||
/// Write a `BitSet`
|
||||
///
|
||||
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
|
||||
//writer.write_all(self.len.to_le_bytes().as_ref())?;
|
||||
//writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
|
||||
for tinyset in self.tinysets.iter() {
|
||||
tinyset.serialize(writer)?;
|
||||
@@ -158,12 +163,9 @@ impl BitSet {
|
||||
|
||||
/// Deserialize a `BitSet`. BitSet is considered immutable after deserialization.
|
||||
///
|
||||
pub fn deserialize(data: &[u8]) -> io::Result<Self> {
|
||||
//let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap());
|
||||
//data = &data[8..];
|
||||
|
||||
//let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
//data = &data[4..];
|
||||
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
|
||||
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
data = &data[4..];
|
||||
|
||||
let mut tinysets = vec![];
|
||||
for chunk in data.chunks_exact(8) {
|
||||
@@ -173,21 +175,35 @@ impl BitSet {
|
||||
Ok(BitSet {
|
||||
tinysets: tinysets.into_boxed_slice(),
|
||||
len: 0,
|
||||
max_value: 0,
|
||||
max_value,
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the set elements
|
||||
/// Iterate over the `TinySet`s, deserializing them on the fly from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn iter_positions_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
|
||||
data.chunks_exact(8)
|
||||
pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
data[4..].chunks_exact(8).map(move |chunk| {
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
tinyset
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the unset elements.
|
||||
///
|
||||
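/// `max_val` is read from the first 4 bytes of `data`; it bounds the output, since the complement of the last 64 bits may contain positions that are `>= max_val`.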
|
||||
#[inline]
|
||||
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
|
||||
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
Self::iter_from_bytes(data)
|
||||
.map(|tinyset| tinyset.complement())
|
||||
.enumerate()
|
||||
.filter(|(_, tinyset)| !tinyset.is_empty())
|
||||
.flat_map(|(chunk_num, chunk)| {
|
||||
let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
.flat_map(move |(chunk_num, tinyset)| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
tinyset
|
||||
.into_iter()
|
||||
.map(move |val| val + chunk_num as u32 * 64)
|
||||
.map(move |val| val + chunk_base_val)
|
||||
.take_while(move |doc| *doc < max_val)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -227,6 +243,15 @@ impl BitSet {
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the element `el` is set in the serialized `BitSet` bytes `data`.
///
/// The bit for `el` is looked up after a 4-byte prefix of `data`, one bit per
/// element, least-significant bit first within each byte.
///
/// # Panics
///
/// Panics if the byte holding the bit of `el` lies beyond the end of `data`.
#[inline]
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
    // Skip the 4-byte prefix, then locate the byte carrying the bit of `el`.
    let byte_idx = (el / 8u32 + 4) as usize;
    let bit_mask = 1u8 << ((el & 7u32) as u8);
    (data[byte_idx] & bit_mask) != 0
}
|
||||
|
||||
/// Returns true iff the element is in the `BitSet`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
|
||||
@@ -289,7 +289,7 @@ impl SegmentReader {
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
|
||||
if let Some(delete_bitset) = &self.delete_bitset_opt {
|
||||
Box::new(delete_bitset.iter_positions())
|
||||
Box::new(delete_bitset.iter_unset())
|
||||
} else {
|
||||
Box::new(0u32..self.max_doc)
|
||||
}
|
||||
|
||||
@@ -3,8 +3,6 @@ use crate::directory::OwnedBytes;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use common::BitSet;
|
||||
use common::TinySet;
|
||||
use std::convert::TryInto;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -13,29 +11,8 @@ use std::io::Write;
|
||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||
/// Warning: this function does not call terminate. The caller is in charge of
|
||||
/// closing the writer properly.
|
||||
pub fn write_delete_bitset(
|
||||
delete_bitset: &BitSet,
|
||||
max_doc: u32,
|
||||
writer: &mut dyn Write,
|
||||
) -> io::Result<()> {
|
||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> {
|
||||
delete_bitset.serialize(writer)?;
|
||||
//let mut byte = 0u8;
|
||||
//let mut shift = 0u8;
|
||||
//for doc in 0..max_doc {
|
||||
//if delete_bitset.contains(doc) {
|
||||
//byte |= 1 << shift;
|
||||
//}
|
||||
//if shift == 7 {
|
||||
//writer.write_all(&[byte])?;
|
||||
//shift = 0;
|
||||
//byte = 0;
|
||||
//} else {
|
||||
//shift += 1;
|
||||
//}
|
||||
//}
|
||||
//if max_doc % 8 > 0 {
|
||||
//writer.write_all(&[byte])?;
|
||||
//}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -59,7 +36,7 @@ impl DeleteBitSet {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("dummydeletebitset");
|
||||
let mut wrt = directory.open_write(path).unwrap();
|
||||
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
|
||||
write_delete_bitset(&bitset, &mut wrt).unwrap();
|
||||
wrt.terminate().unwrap();
|
||||
let file = directory.open_read(path).unwrap();
|
||||
Self::open(file).unwrap()
|
||||
@@ -68,12 +45,8 @@ impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
|
||||
let bytes = file.read_bytes()?;
|
||||
let num_deleted = bytes
|
||||
.chunks_exact(8)
|
||||
.map(|chunk| {
|
||||
let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
tinyset.len() as usize
|
||||
})
|
||||
let num_deleted = BitSet::iter_from_bytes(bytes.as_slice())
|
||||
.map(|tinyset| tinyset.len() as usize)
|
||||
.sum();
|
||||
|
||||
Ok(DeleteBitSet {
|
||||
@@ -91,17 +64,15 @@ impl DeleteBitSet {
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = self.data.as_slice()[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
let data = self.data.as_slice();
|
||||
BitSet::contains_from_bytes(doc, data)
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the unset elements (the documents that are not deleted)
|
||||
#[inline]
|
||||
pub fn iter_positions(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
pub fn iter_unset(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
let data = self.data.as_slice();
|
||||
BitSet::iter_positions_from_bytes(data)
|
||||
BitSet::iter_unset_from_bytes(data)
|
||||
}
|
||||
|
||||
/// The number of deleted docs
|
||||
@@ -152,29 +123,82 @@ mod tests {
|
||||
fn test_delete_bitset_iter_minimal() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[7], 8);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_positions().collect();
|
||||
assert_eq!(data, vec![7]);
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset_iter_small() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_positions().collect();
|
||||
assert_eq!(data, vec![0, 2, 3, 6]);
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, vec![1, 4, 5]);
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_bitset_iter() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100], 110);
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_positions().collect();
|
||||
assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100]);
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_bitset_iter_empty_blocks() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100, 1000], 1010);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_positions().collect();
|
||||
assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100, 1000]);
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, (2..=999).collect::<Vec<_>>());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::DeleteBitSet;
    use rand::prelude::SliceRandom;
    use rand::thread_rng;
    use test::Bencher;

    /// Number of documents (`max_doc`) used by every benchmark in this module.
    const NUM_DOCS: u32 = 1_000_000;

    /// Returns a sorted, uniformly random subset holding 1/8 of the doc ids in
    /// `0..NUM_DOCS`.
    ///
    /// The ids are picked by shuffling once and truncating, instead of removing
    /// one random element at a time from a `Vec` (each `Vec::remove` shifts the
    /// tail, which made building this fixture quadratic).
    ///
    /// NOTE(review): the result is passed to `DeleteBitSet::for_test` as the
    /// *deleted* docs, so 1/8 of the docs end up deleted and 7/8 alive, while
    /// the benchmarks using it are named `*_1_8_alive` — confirm which ratio is
    /// intended.
    fn get_many_deleted() -> Vec<u32> {
        // 7/8 of the ids are dropped, the remaining 1/8 are returned.
        let num_kept = (NUM_DOCS - NUM_DOCS * 7 / 8) as usize;
        let mut docs: Vec<u32> = (0..NUM_DOCS).collect();
        docs.shuffle(&mut thread_rng());
        docs.truncate(num_kept);
        // Restore ascending order, as produced by the previous implementation.
        docs.sort_unstable();
        docs
    }

    /// Collects the unset positions through the on-the-fly iterator, with only
    /// a handful of deleted docs.
    #[bench]
    fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], NUM_DOCS);

        bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
    }

    /// Collects the alive docs by probing every doc id individually, with only
    /// a handful of deleted docs.
    #[bench]
    fn bench_deletebitset_access(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], NUM_DOCS);

        bench.iter(|| {
            (0..NUM_DOCS)
                .filter(|doc| delete_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }

    /// Same as `bench_deletebitset_iter_deser_on_fly`, with the random fixture
    /// from `get_many_deleted`.
    #[bench]
    fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), NUM_DOCS);

        bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
    }

    /// Same as `bench_deletebitset_access`, with the random fixture from
    /// `get_many_deleted`.
    #[bench]
    fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), NUM_DOCS);

        bench.iter(|| {
            (0..NUM_DOCS)
                .filter(|doc| delete_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }
}
|
||||
|
||||
@@ -180,7 +180,7 @@ pub(crate) fn advance_deletes(
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user