mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-25 20:50:43 +00:00
create and use ReadSerializedBitSet
This commit is contained in:
@@ -10,6 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version="0.1", path="../ownedbytes" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use ownedbytes::OwnedBytes;
|
||||
use std::convert::TryInto;
|
||||
use std::io::Write;
|
||||
use std::u64;
|
||||
@@ -64,34 +65,28 @@ impl TinySet {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns true iff the `TinySet` bit is set at position `pos`.
|
||||
pub fn contains(self, pos: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(pos)).is_empty()
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the number of set bits in the TinySet.
|
||||
pub fn num_set(self) -> u32 {
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the number of unset bits in the TinySet.
|
||||
pub fn num_unset(self) -> u32 {
|
||||
self.0.count_zeros()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` with only one bit set at `pos`.
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline]
|
||||
pub fn singleton(pos: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(pos))
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
@@ -108,7 +103,7 @@ impl TinySet {
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
/// returns true if the set changed
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
@@ -116,9 +111,9 @@ impl TinySet {
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Remove a new element within [0..64)
|
||||
/// Remove an element within [0..64)
|
||||
///
|
||||
/// returns true if the bit changed
|
||||
/// returns true if the set changed
|
||||
#[inline]
|
||||
pub fn remove_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
@@ -203,7 +198,7 @@ impl BitSet {
|
||||
let mut tinysets = vec![];
|
||||
for chunk in data.chunks_exact(8) {
|
||||
let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?;
|
||||
len += tinyset.num_set() as u64;
|
||||
len += tinyset.len() as u64;
|
||||
tinysets.push(tinyset);
|
||||
}
|
||||
Ok(BitSet {
|
||||
@@ -213,43 +208,6 @@ impl BitSet {
|
||||
})
|
||||
}
|
||||
|
||||
/// Count the number of unset bits from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize {
|
||||
BitSet::iter_tinysets_from_bytes(data)
|
||||
.map(|tinyset| tinyset.num_unset() as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
|
||||
assert!((data.len() - 4) % 8 == 0);
|
||||
data[4..].chunks_exact(8).map(move |chunk| {
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
tinyset
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the unset elements.
|
||||
///
|
||||
/// max_val needs to be provided, since the last 64 bits may
|
||||
#[inline]
|
||||
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
|
||||
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
|
||||
Self::iter_tinysets_from_bytes(data)
|
||||
.enumerate()
|
||||
.flat_map(move |(chunk_num, tinyset)| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
tinyset
|
||||
.into_iter()
|
||||
.map(move |val| val + chunk_base_val)
|
||||
.take_while(move |doc| *doc < max_val)
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
@@ -262,9 +220,9 @@ impl BitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// Create a new `BitSet`, with all values initially set, that may contain elements
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value_and_filled(max_value: u32) -> BitSet {
|
||||
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
@@ -282,11 +240,12 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn num_set_bits(&self) -> usize {
|
||||
pub fn len(&self) -> usize {
|
||||
self.len as usize
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
#[inline]
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
@@ -299,6 +258,7 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Removes an element from the `BitSet`
|
||||
#[inline]
|
||||
pub fn remove(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
@@ -312,14 +272,6 @@ impl BitSet {
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
|
||||
let byte_offset = 4 + el / 8u32;
|
||||
let b: u8 = data[byte_offset as usize];
|
||||
let shift = (el % 8) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Returns true iff the element is in the `BitSet`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
}
|
||||
@@ -349,17 +301,133 @@ impl BitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Lazily reads a serialized `BitSet`.
|
||||
#[derive(Clone)]
|
||||
pub struct ReadSerializedBitSet {
|
||||
data: OwnedBytes,
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
impl ReadSerializedBitSet {
|
||||
pub fn new(data: OwnedBytes) -> Self {
|
||||
let (max_value_data, data) = data.split(4);
|
||||
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
|
||||
ReadSerializedBitSet { data, max_value }
|
||||
}
|
||||
|
||||
/// Count the number of unset bits from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
pub fn count_unset(&self) -> usize {
|
||||
let lower = self.max_value % 64u32;
|
||||
|
||||
let num_set: usize = self
|
||||
.iter_tinysets()
|
||||
.map(|(tinyset, is_last)| {
|
||||
if is_last {
|
||||
tinyset.intersect(TinySet::range_lower(lower)).len() as usize
|
||||
} else {
|
||||
tinyset.len() as usize
|
||||
}
|
||||
})
|
||||
.sum();
|
||||
self.max_value as usize - num_set
|
||||
}
|
||||
|
||||
/// Iterate over the tinysets on the fly from serialized data.
|
||||
///
|
||||
/// The iterator yields `(TinySet, is_last)` pairs, so the consumer can ignore the bits at or past `max_value` in the
|
||||
/// last block.
|
||||
///
|
||||
#[inline]
|
||||
fn iter_tinysets<'a>(&'a self) -> impl Iterator<Item = (TinySet, bool)> + 'a {
|
||||
assert!((self.data.len()) % 8 == 0);
|
||||
self.data
|
||||
.chunks_exact(8)
|
||||
.enumerate()
|
||||
.map(move |(chunk_num, chunk)| {
|
||||
let is_last = (chunk_num + 1) * 8 == self.data.len();
|
||||
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
|
||||
(tinyset, is_last)
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the set elements below `max_value` (despite the `unset` in the name).
|
||||
///
|
||||
#[inline]
|
||||
pub fn iter_unset<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
|
||||
self.iter_tinysets()
|
||||
.enumerate()
|
||||
.flat_map(move |(chunk_num, (tinyset, _))| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
tinyset
|
||||
.into_iter()
|
||||
.map(move |val| val + chunk_base_val)
|
||||
.take_while(move |doc| *doc < self.max_value)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns true iff the element is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
let byte_offset = el / 8u32;
|
||||
let b: u8 = self.data[byte_offset as usize];
|
||||
let shift = (el % 8) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Returns the max_value.
|
||||
#[inline]
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BitSet;
|
||||
use super::ReadSerializedBitSet;
|
||||
use super::TinySet;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use rand::distributions::Bernoulli;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use std::collections::HashSet;
|
||||
use std::convert::TryInto;
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_full() {
|
||||
let mut bitset = BitSet::with_max_value_and_full(5);
|
||||
bitset.remove(3);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_empty() {
|
||||
let mut bitset = BitSet::with_max_value(5);
|
||||
bitset.insert(3);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 4);
|
||||
|
||||
{
|
||||
let bitset = BitSet::with_max_value(5);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 5);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set_remove() {
|
||||
{
|
||||
@@ -452,7 +520,7 @@ mod tests {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
assert_eq!(bitset.num_set_bits(), els.len());
|
||||
assert_eq!(bitset.len(), els.len());
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
@@ -506,25 +574,25 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
bitset.remove(105u32);
|
||||
assert_eq!(bitset.num_set_bits(), 3);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
bitset.remove(104u32);
|
||||
assert_eq!(bitset.num_set_bits(), 2);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.remove(3u32);
|
||||
assert_eq!(bitset.num_set_bits(), 1);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.remove(103u32);
|
||||
assert_eq!(bitset.num_set_bits(), 0);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||
|
||||
@@ -73,7 +73,7 @@ impl SegmentReader {
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.alive_bitset()
|
||||
.map(|delete_set| delete_set.num_deleted() as DocId)
|
||||
.map(|alive_set| alive_set.num_deleted() as DocId)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
@@ -289,7 +289,7 @@ impl SegmentReader {
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
|
||||
if let Some(alive_bitset) = &self.alive_bitset_opt {
|
||||
Box::new(alive_bitset.iter_unset())
|
||||
Box::new(alive_bitset.iter_alive())
|
||||
} else {
|
||||
Box::new(0u32..self.max_doc)
|
||||
}
|
||||
|
||||
@@ -89,7 +89,7 @@ pub trait DocSet: Send {
|
||||
let mut count = 0u32;
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
if !alive_bitset.is_deleted(doc) {
|
||||
if alive_bitset.is_alive(doc) {
|
||||
count += 1u32;
|
||||
}
|
||||
doc = self.advance();
|
||||
|
||||
@@ -3,6 +3,7 @@ use crate::directory::OwnedBytes;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use common::BitSet;
|
||||
use common::ReadSerializedBitSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -21,16 +22,17 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
|
||||
pub struct AliveBitSet {
|
||||
data: OwnedBytes,
|
||||
num_deleted: usize,
|
||||
bitset: ReadSerializedBitSet,
|
||||
}
|
||||
|
||||
impl AliveBitSet {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet {
|
||||
pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
|
||||
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
|
||||
use std::path::Path;
|
||||
assert!(not_alive_docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value_and_filled(max_doc);
|
||||
for &doc in not_alive_docs {
|
||||
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value_and_full(max_doc);
|
||||
for &doc in deleted_docs {
|
||||
bitset.remove(doc);
|
||||
}
|
||||
let directory = RamDirectory::create();
|
||||
@@ -45,32 +47,38 @@ impl AliveBitSet {
|
||||
/// Opens a delete bitset given its file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<AliveBitSet> {
|
||||
let bytes = file.read_bytes()?;
|
||||
let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice());
|
||||
let bitset = ReadSerializedBitSet::new(bytes.clone());
|
||||
let num_deleted = bitset.count_unset();
|
||||
|
||||
Ok(AliveBitSet {
|
||||
data: bytes,
|
||||
num_deleted,
|
||||
bitset,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
#[inline]
|
||||
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||
!self.is_deleted(doc)
|
||||
self.bitset.contains(doc)
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
let data = self.data.as_slice();
|
||||
!BitSet::contains_from_bytes(doc, data)
|
||||
!self.is_alive(doc)
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the set elements
|
||||
/// Iterate over the alive docids.
|
||||
#[inline]
|
||||
pub fn iter_unset(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
let data = self.data.as_slice();
|
||||
BitSet::iter_unset_from_bytes(data)
|
||||
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
|
||||
self.bitset.iter_unset()
|
||||
}
|
||||
|
||||
/// Get underlying bitset
|
||||
#[inline]
|
||||
pub fn bitset(&self) -> &ReadSerializedBitSet {
|
||||
&self.bitset
|
||||
}
|
||||
|
||||
/// The number of deleted docs
|
||||
@@ -121,7 +129,7 @@ mod tests {
|
||||
fn test_alive_bitset_iter_minimal() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[7], 8);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_unset().collect();
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
|
||||
}
|
||||
|
||||
@@ -129,14 +137,14 @@ mod tests {
|
||||
fn test_alive_bitset_iter_small() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_unset().collect();
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, vec![1, 4, 5]);
|
||||
}
|
||||
#[test]
|
||||
fn test_alive_bitset_iter() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_unset().collect();
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, (2..=999).collect::<Vec<_>>());
|
||||
}
|
||||
}
|
||||
@@ -166,7 +174,7 @@ mod bench {
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_unset().collect::<Vec<_>>());
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -184,7 +192,7 @@ mod bench {
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_unset().collect::<Vec<_>>());
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
|
||||
@@ -151,7 +151,7 @@ pub(crate) fn advance_deletes(
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
|
||||
Some(previous_alive_bitset) => (*previous_alive_bitset).clone(),
|
||||
None => BitSet::with_max_value_and_filled(max_doc),
|
||||
None => BitSet::with_max_value_and_full(max_doc),
|
||||
};
|
||||
|
||||
let num_deleted_docs_before = segment.meta().num_deleted_docs();
|
||||
@@ -175,7 +175,7 @@ pub(crate) fn advance_deletes(
|
||||
}
|
||||
}
|
||||
|
||||
let num_alive_docs: u32 = alive_bitset.num_set_bits() as u32;
|
||||
let num_alive_docs: u32 = alive_bitset.len() as u32;
|
||||
let num_deleted_docs = max_doc - num_alive_docs;
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
@@ -259,7 +259,7 @@ fn apply_deletes(
|
||||
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc);
|
||||
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
|
||||
@@ -99,22 +99,21 @@ fn compute_min_max_val(
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Option<(u64, u64)> {
|
||||
if segment_reader.max_doc() == 0 {
|
||||
None
|
||||
} else {
|
||||
if segment_reader.alive_bitset().is_some() {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
minmax(
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(|doc_id| u64_reader.get(doc_id)),
|
||||
)
|
||||
} else {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
Some((u64_reader.min_value(), u64_reader.max_value()))
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
if segment_reader.alive_bitset().is_none() {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
return Some((u64_reader.min_value(), u64_reader.max_value()));
|
||||
}
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
minmax(
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(|doc_id| u64_reader.get(doc_id)),
|
||||
)
|
||||
}
|
||||
|
||||
struct TermOrdinalMapping {
|
||||
|
||||
@@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet {
|
||||
/// but we don't have access to any better
|
||||
/// value.
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docs.num_set_bits() as u32
|
||||
self.docs.len() as u32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,7 +124,7 @@ mod tests {
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.num_set_bits());
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
let mut remaining = true;
|
||||
for el in btreeset.into_iter() {
|
||||
|
||||
@@ -219,18 +219,14 @@ where
|
||||
}
|
||||
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
|
||||
.iter()
|
||||
.map(|bitset| bitset.num_set())
|
||||
.map(|bitset| bitset.len())
|
||||
.sum::<u32>()
|
||||
+ 1;
|
||||
for bitset in self.bitsets.iter_mut() {
|
||||
bitset.clear();
|
||||
}
|
||||
while self.refill() {
|
||||
count += self
|
||||
.bitsets
|
||||
.iter()
|
||||
.map(|bitset| bitset.num_set())
|
||||
.sum::<u32>();
|
||||
count += self.bitsets.iter().map(|bitset| bitset.len()).sum::<u32>();
|
||||
for bitset in self.bitsets.iter_mut() {
|
||||
bitset.clear();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user