mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 09:12:55 +00:00
AliveBitSet instead of DeleteBitSet
This commit is contained in:
@@ -31,7 +31,7 @@ impl IntoIterator for TinySet {
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.0.to_le_bytes().as_ref())
|
||||
}
|
||||
|
||||
@@ -42,17 +42,24 @@ impl TinySet {
|
||||
}
|
||||
|
||||
/// Returns an empty `TinySet`.
|
||||
#[inline]
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns a full `TinySet`.
|
||||
#[inline]
|
||||
pub fn full() -> TinySet {
|
||||
TinySet::empty().complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
pub fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
@@ -68,6 +75,12 @@ impl TinySet {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn num_unset(self) -> u32 {
|
||||
self.0.count_zeros()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
@@ -81,13 +94,21 @@ impl TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
/// Insert a new element within [0..64)
|
||||
#[inline]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
/// Removes an element within [0..64)
|
||||
#[inline]
|
||||
pub fn remove(self, el: u32) -> TinySet {
|
||||
self.intersect(TinySet::singleton(el).complement())
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
@@ -95,6 +116,16 @@ impl TinySet {
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Remove a new element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
#[inline]
|
||||
pub fn remove_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.remove(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
@@ -151,7 +182,7 @@ fn num_buckets(max_val: u32) -> u32 {
|
||||
impl BitSet {
|
||||
/// serialize a `BitSet`.
|
||||
///
|
||||
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
|
||||
for tinyset in self.tinysets.iter() {
|
||||
@@ -163,6 +194,7 @@ impl BitSet {
|
||||
|
||||
/// Deserialize a `BitSet`.
|
||||
///
|
||||
#[cfg(test)]
|
||||
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
|
||||
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
data = &data[4..];
|
||||
@@ -181,10 +213,19 @@ impl BitSet {
|
||||
})
|
||||
}
|
||||
|
||||
/// Count the number of unset bits from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize {
|
||||
BitSet::iter_tinysets_from_bytes(data)
|
||||
.map(|tinyset| tinyset.num_unset() as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
assert!((data.len() - 4) % 8 == 0);
|
||||
data[4..].chunks_exact(8).map(move |chunk| {
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
@@ -198,8 +239,7 @@ impl BitSet {
|
||||
#[inline]
|
||||
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
|
||||
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
Self::iter_from_bytes(data)
|
||||
.map(|tinyset| tinyset.complement())
|
||||
Self::iter_tinysets_from_bytes(data)
|
||||
.enumerate()
|
||||
.flat_map(move |(chunk_num, tinyset)| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
@@ -211,7 +251,7 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val[`.
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
@@ -222,6 +262,18 @@ impl BitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value_and_filled(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybisets,
|
||||
len: max_value as u64,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
@@ -230,7 +282,7 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn len(&self) -> usize {
|
||||
pub fn num_set_bits(&self) -> usize {
|
||||
self.len as usize
|
||||
}
|
||||
|
||||
@@ -246,6 +298,18 @@ impl BitSet {
|
||||
};
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
pub fn remove(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
|
||||
@@ -296,6 +360,33 @@ mod tests {
|
||||
use std::collections::HashSet;
|
||||
use std::convert::TryInto;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set_remove() {
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty()
|
||||
.insert(63u32)
|
||||
.insert(1)
|
||||
.insert(5)
|
||||
.remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(1u32);
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
@@ -361,7 +452,7 @@ mod tests {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
assert_eq!(bitset.len(), els.len());
|
||||
assert_eq!(bitset.num_set_bits(), els.len());
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
@@ -415,17 +506,25 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
bitset.remove(105u32);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
bitset.remove(104u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
bitset.remove(3u32);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
bitset.remove(103u32);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::core::SegmentId;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::fastfield::FastFieldReaders;
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
@@ -47,7 +47,7 @@ pub struct SegmentReader {
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
|
||||
store_file: FileSlice,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
delete_bitset_opt: Option<AliveBitSet>,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@@ -172,7 +172,7 @@ impl SegmentReader {
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::Delete)?;
|
||||
let delete_bitset = DeleteBitSet::open(delete_data)?;
|
||||
let delete_bitset = AliveBitSet::open(delete_data)?;
|
||||
Some(delete_bitset)
|
||||
} else {
|
||||
None
|
||||
@@ -274,7 +274,7 @@ impl SegmentReader {
|
||||
|
||||
/// Returns the bitset representing
|
||||
/// the documents that have been deleted.
|
||||
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
|
||||
pub fn delete_bitset(&self) -> Option<&AliveBitSet> {
|
||||
self.delete_bitset_opt.as_ref()
|
||||
}
|
||||
|
||||
@@ -307,7 +307,7 @@ impl SegmentReader {
|
||||
self.get_store_reader()?.space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
.map(DeleteBitSet::space_usage)
|
||||
.map(AliveBitSet::space_usage)
|
||||
.unwrap_or(0),
|
||||
))
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
@@ -85,7 +85,7 @@ pub trait DocSet: Send {
|
||||
|
||||
/// Returns the number documents matching.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
@@ -130,7 +130,7 @@ impl<'a> DocSet for &'a mut dyn DocSet {
|
||||
(**self).size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 {
|
||||
(**self).count(delete_bitset)
|
||||
}
|
||||
|
||||
@@ -160,7 +160,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count(delete_bitset)
|
||||
}
|
||||
|
||||
@@ -11,27 +11,27 @@ use std::io::Write;
|
||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||
/// Warning: this function does not call terminate. The caller is in charge of
|
||||
/// closing the writer properly.
|
||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> {
|
||||
pub fn write_delete_bitset<T: Write>(delete_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
|
||||
delete_bitset.serialize(writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set of deleted `DocId`s.
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteBitSet {
|
||||
pub struct AliveBitSet {
|
||||
data: OwnedBytes,
|
||||
num_deleted: usize,
|
||||
}
|
||||
|
||||
impl DeleteBitSet {
|
||||
impl AliveBitSet {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
|
||||
pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet {
|
||||
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
|
||||
use std::path::Path;
|
||||
assert!(docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value(max_doc);
|
||||
for &doc in docs {
|
||||
bitset.insert(doc);
|
||||
assert!(not_alive_docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value_and_filled(max_doc);
|
||||
for &doc in not_alive_docs {
|
||||
bitset.remove(doc);
|
||||
}
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("dummydeletebitset");
|
||||
@@ -43,13 +43,11 @@ impl DeleteBitSet {
|
||||
}
|
||||
|
||||
/// Opens a delete bitset given its file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
|
||||
pub fn open(file: FileSlice) -> crate::Result<AliveBitSet> {
|
||||
let bytes = file.read_bytes()?;
|
||||
let num_deleted = BitSet::iter_from_bytes(bytes.as_slice())
|
||||
.map(|tinyset| tinyset.len() as usize)
|
||||
.sum();
|
||||
let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice());
|
||||
|
||||
Ok(DeleteBitSet {
|
||||
Ok(AliveBitSet {
|
||||
data: bytes,
|
||||
num_deleted,
|
||||
})
|
||||
@@ -65,7 +63,7 @@ impl DeleteBitSet {
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
let data = self.data.as_slice();
|
||||
BitSet::contains_from_bytes(doc, data)
|
||||
!BitSet::contains_from_bytes(doc, data)
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the set elements
|
||||
@@ -88,11 +86,11 @@ impl DeleteBitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::DeleteBitSet;
|
||||
use super::AliveBitSet;
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset_empty() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[], 10);
|
||||
let delete_bitset = AliveBitSet::for_test(&[], 10);
|
||||
for doc in 0..10 {
|
||||
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
|
||||
}
|
||||
@@ -101,7 +99,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
|
||||
let delete_bitset = AliveBitSet::for_test(&[1, 9], 10);
|
||||
assert!(delete_bitset.is_alive(0));
|
||||
assert!(delete_bitset.is_deleted(1));
|
||||
assert!(delete_bitset.is_alive(2));
|
||||
@@ -121,7 +119,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset_iter_minimal() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[7], 8);
|
||||
let delete_bitset = AliveBitSet::for_test(&[7], 8);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
|
||||
@@ -129,14 +127,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset_iter_small() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7);
|
||||
let delete_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, vec![1, 4, 5]);
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_bitset_iter() {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001);
|
||||
let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001);
|
||||
|
||||
let data: Vec<_> = delete_bitset.iter_unset().collect();
|
||||
assert_eq!(data, (2..=999).collect::<Vec<_>>());
|
||||
@@ -146,16 +144,14 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::DeleteBitSet;
|
||||
use common::BitSet;
|
||||
use super::AliveBitSet;
|
||||
use rand::prelude::IteratorRandom;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
fn get_many_deleted() -> Vec<u32> {
|
||||
fn get_alive() -> Vec<u32> {
|
||||
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
|
||||
for _ in 0..(1_000_000) * 7 / 8 {
|
||||
for _ in 0..(1_000_000) * 1 / 8 {
|
||||
remove_rand(&mut data);
|
||||
}
|
||||
data
|
||||
@@ -168,14 +164,14 @@ mod bench {
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access(bench: &mut Bencher) {
|
||||
let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1_000_000_u32)
|
||||
@@ -186,14 +182,14 @@ mod bench {
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000);
|
||||
let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000);
|
||||
let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1_000_000_u32)
|
||||
@@ -23,9 +23,9 @@ values stored.
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
pub use self::alive_bitset::write_delete_bitset;
|
||||
pub use self::alive_bitset::AliveBitSet;
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
@@ -46,8 +46,8 @@ use crate::{
|
||||
schema::Type,
|
||||
};
|
||||
|
||||
mod alive_bitset;
|
||||
mod bytes;
|
||||
mod delete;
|
||||
mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
|
||||
@@ -114,7 +114,7 @@ fn compute_deleted_bitset(
|
||||
let mut doc_matching_deleted_term = docset.doc();
|
||||
while doc_matching_deleted_term != TERMINATED {
|
||||
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
|
||||
delete_bitset.insert(doc_matching_deleted_term);
|
||||
delete_bitset.remove(doc_matching_deleted_term);
|
||||
might_have_changed = true;
|
||||
}
|
||||
doc_matching_deleted_term = docset.advance();
|
||||
@@ -151,7 +151,7 @@ pub(crate) fn advance_deletes(
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_max_value(max_doc),
|
||||
None => BitSet::with_max_value_and_filled(max_doc),
|
||||
};
|
||||
|
||||
let num_deleted_docs_before = segment.meta().num_deleted_docs();
|
||||
@@ -170,12 +170,13 @@ pub(crate) fn advance_deletes(
|
||||
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
|
||||
for doc in 0u32..max_doc {
|
||||
if seg_delete_bitset.is_deleted(doc) {
|
||||
delete_bitset.insert(doc);
|
||||
delete_bitset.remove(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let num_deleted_docs: u32 = delete_bitset.len() as u32;
|
||||
let num_alive_docs: u32 = delete_bitset.num_set_bits() as u32;
|
||||
let num_deleted_docs = max_doc - num_alive_docs;
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
@@ -259,7 +260,7 @@ fn apply_deletes(
|
||||
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_max_value(max_doc);
|
||||
let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldDataAccess;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::fastfield::{DeleteBitSet, FastFieldReader};
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
@@ -257,7 +257,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
|
||||
@@ -336,7 +336,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
|
||||
@@ -446,7 +446,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = DeleteBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.delete_bitset().unwrap_or(&fallback_bitset)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::docset::DocSet;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::branchless_binary_search;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
@@ -34,7 +34,7 @@ impl SegmentPostings {
|
||||
///
|
||||
/// This method will clone and scan through the posting lists.
|
||||
/// (this is a rather expensive operation).
|
||||
pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
pub fn doc_freq_given_deletes(&self, delete_bitset: &AliveBitSet) -> u32 {
|
||||
let mut docset = self.clone();
|
||||
let mut doc_freq = 0;
|
||||
loop {
|
||||
@@ -268,7 +268,7 @@ mod tests {
|
||||
use common::HasLen;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::postings::postings::Postings;
|
||||
|
||||
#[test]
|
||||
@@ -296,9 +296,9 @@ mod tests {
|
||||
fn test_doc_freq() {
|
||||
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
|
||||
assert_eq!(docs.doc_freq(), 3);
|
||||
let delete_bitset = DeleteBitSet::for_test(&[2], 12);
|
||||
let delete_bitset = AliveBitSet::for_test(&[2], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2);
|
||||
let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet {
|
||||
/// but we don't have access to any better
|
||||
/// value.
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docs.len() as u32
|
||||
self.docs.num_set_bits() as u32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,7 +124,7 @@ mod tests {
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
assert_eq!(btreeset.len(), bitset.num_set_bits());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
let mut remaining = true;
|
||||
for el in btreeset.into_iter() {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
|
||||
@@ -118,7 +118,7 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
self.underlying.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 {
|
||||
self.underlying.count(delete_bitset)
|
||||
}
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ pub mod tests {
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::*;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
|
||||
use crate::schema::{Document, TextOptions};
|
||||
use crate::{
|
||||
@@ -113,7 +113,7 @@ pub mod tests {
|
||||
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
|
||||
// this will cover deletion of the first element in a checkpoint
|
||||
let deleted_docids = (200..300).collect::<Vec<_>>();
|
||||
let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
|
||||
let delete_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
|
||||
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::schema::Document;
|
||||
use crate::space_usage::StoreSpaceUsage;
|
||||
use crate::store::index::Checkpoint;
|
||||
use crate::DocId;
|
||||
use crate::{error::DataCorruption, fastfield::DeleteBitSet};
|
||||
use crate::{error::DataCorruption, fastfield::AliveBitSet};
|
||||
use common::{BinarySerializable, HasLen, VInt};
|
||||
use lru::LruCache;
|
||||
use std::io;
|
||||
@@ -136,7 +136,7 @@ impl StoreReader {
|
||||
/// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong.
|
||||
pub fn iter<'a: 'b, 'b>(
|
||||
&'b self,
|
||||
delete_bitset: Option<&'a DeleteBitSet>,
|
||||
delete_bitset: Option<&'a AliveBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
|
||||
self.iter_raw(delete_bitset).map(|doc_bytes_res| {
|
||||
let mut doc_bytes = doc_bytes_res?;
|
||||
@@ -149,7 +149,7 @@ impl StoreReader {
|
||||
/// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong.
|
||||
pub(crate) fn iter_raw<'a: 'b, 'b>(
|
||||
&'b self,
|
||||
delete_bitset: Option<&'a DeleteBitSet>,
|
||||
delete_bitset: Option<&'a AliveBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
|
||||
let last_docid = self
|
||||
.block_checkpoints()
|
||||
|
||||
Reference in New Issue
Block a user