calc mem_usage of more structs

calc mem_usage of more structs in index creation
add some comments
This commit is contained in:
Pascal Seitz
2021-04-30 14:16:39 +02:00
parent 83cf638a2e
commit 25b9429929
9 changed files with 73 additions and 11 deletions

View File

@@ -10,21 +10,24 @@ mod tests {
fn bench_blockedbitp_read(b: &mut Bencher) {
let mut blocked_bitpacker = BlockedBitpacker::new();
for val in 0..=21500 {
blocked_bitpacker.add(val);
blocked_bitpacker.add(val * val);
}
b.iter(|| {
let mut out = 0;
for val in 0..=21500 {
blocked_bitpacker.get(val);
out = blocked_bitpacker.get(val);
}
out
});
}
#[bench]
fn bench_blockbitp_create(b: &mut Bencher) {
fn bench_blockedbitp_create(b: &mut Bencher) {
b.iter(|| {
let mut blocked_bitpacker = BlockedBitpacker::new();
for val in 0..=21500 {
blocked_bitpacker.add(val);
blocked_bitpacker.add(val * val);
}
blocked_bitpacker
});
}
}

View File

@@ -4,7 +4,7 @@ use super::{bitpacker::BitPacker, compute_num_bits};
const BLOCK_SIZE: usize = 128;
/// BlockedBitpacker compresses data in blocks of
/// `BlockedBitpacker` compresses data in blocks of
/// 128 elements, while keeping an index on it
///
#[derive(Debug, Clone)]
@@ -16,6 +16,12 @@ pub struct BlockedBitpacker {
offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
}
/// `BlockedBitpackerEntryMetaData` encodes the
/// offset and bit_width into a u64 bit field
///
/// This saves some space, since 7 bytes are more
/// than enough for the offset, and it also keeps
/// access fast because of alignment
#[derive(Debug, Clone, Default)]
struct BlockedBitpackerEntryMetaData {
encoded: u64,
@@ -23,7 +29,7 @@ struct BlockedBitpackerEntryMetaData {
impl BlockedBitpackerEntryMetaData {
fn new(offset: u64, num_bits: u8) -> Self {
let encoded = offset | (num_bits as u64) << 56;
let encoded = offset | (num_bits as u64) << (64 - 8);
Self { encoded }
}
fn offset(&self) -> u64 {
@@ -33,6 +39,7 @@ impl BlockedBitpackerEntryMetaData {
(self.encoded >> 56) as u8
}
}
#[test]
fn metadata_test() {
let meta = BlockedBitpackerEntryMetaData::new(50000, 6);
@@ -51,8 +58,10 @@ impl BlockedBitpacker {
}
}
pub fn get_memory_usage(&self) -> usize {
self.compressed_blocks.capacity()
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
std::mem::size_of::<BlockedBitpacker>()
+ self.compressed_blocks.capacity()
+ self.offset_and_bits.capacity()
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
+ self.cache.capacity()
@@ -80,6 +89,10 @@ impl BlockedBitpacker {
self.compressed_blocks
.resize(self.compressed_blocks.len() - 8, 0); // remove padding for bitpacker
let offset = self.compressed_blocks.len() as u64;
// todo performance: for some bit_width we
// can encode multiple vals into the
// mini_buffer before checking to flush
// (to be done in BitPacker)
for val in self.cache.iter() {
bit_packer
.write(*val, num_bits_block, &mut self.compressed_blocks)
@@ -108,6 +121,7 @@ impl BlockedBitpacker {
}
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress the whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
let iter = (0..bitpacked_elems)
.map(move |idx| self.get(idx))

View File

@@ -14,7 +14,7 @@ pub use crate::blocked_bitpacker::BlockedBitpacker;
///
/// The logic is slightly more convoluted here as for optimization
/// reasons, we want to ensure that a value spans at most 8 bytes
/// of aligns bytes.
/// of aligned bytes.
///
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.

View File

@@ -35,6 +35,10 @@ impl BytesFastFieldWriter {
}
}
/// Estimates the memory held by this writer's buffers.
///
/// The result is the capacity of `vals` plus the capacity of
/// `doc_index` multiplied by the size of a `u64`.
pub fn mem_usage(&self) -> usize {
    let doc_index_usage = self.doc_index.capacity() * std::mem::size_of::<u64>();
    let vals_usage = self.vals.capacity();
    vals_usage + doc_index_usage
}
/// Access the field associated to the `BytesFastFieldWriter`
pub fn field(&self) -> Field {
self.field

View File

@@ -48,6 +48,12 @@ impl MultiValuedFastFieldWriter {
}
}
/// Estimates the memory held by this writer's buffers.
///
/// The result is the capacity of `vals` multiplied by the size of an
/// `UnorderedTermId`, plus the capacity of `doc_index` multiplied by
/// the size of a `u64`.
pub fn mem_usage(&self) -> usize {
    let vals_usage = self.vals.capacity() * std::mem::size_of::<UnorderedTermId>();
    let doc_index_usage = self.doc_index.capacity() * std::mem::size_of::<u64>();
    vals_usage + doc_index_usage
}
/// Access the field associated to the `MultiValuedFastFieldWriter`
pub fn field(&self) -> Field {
self.field

View File

@@ -1,7 +1,5 @@
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
@@ -73,6 +71,24 @@ impl FastFieldsWriter {
}
}
/// Estimates the memory used by all the writers owned by this struct.
///
/// Sums `mem_usage()` over the single-value writers, the multi-value
/// writers and the bytes writers, and returns the grand total.
pub fn mem_usage(&self) -> usize {
    let single_value_usage: usize = self
        .single_value_writers
        .iter()
        .map(|writer| writer.mem_usage())
        .sum();
    let multi_values_usage: usize = self
        .multi_values_writers
        .iter()
        .map(|writer| writer.mem_usage())
        .sum();
    let bytes_value_usage: usize = self
        .bytes_value_writers
        .iter()
        .map(|writer| writer.mem_usage())
        .sum();
    single_value_usage + multi_values_usage + bytes_value_usage
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
@@ -178,6 +194,11 @@ impl IntFastFieldWriter {
}
}
/// The memory used by this writer (including children).
///
/// Delegates entirely to `self.vals.mem_usage()`; nothing else is
/// added to the returned value.
pub fn mem_usage(&self) -> usize {
self.vals.mem_usage()
}
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
self.field

View File

@@ -50,6 +50,13 @@ impl FieldNormsWriter {
}
}
/// Estimates the memory held by the fieldnorm buffers.
///
/// Adds up the `capacity()` of every entry in `fieldnorms_buffer`.
pub fn mem_usage(&self) -> usize {
    let mut total = 0;
    for buffer in self.fieldnorms_buffer.iter() {
        total += buffer.capacity();
    }
    total
}
/// Ensure that all documents in 0..max_doc have a byte associated with them
/// in each of the fieldnorm vectors.
///

View File

@@ -111,6 +111,8 @@ impl SegmentWriter {
pub fn mem_usage(&self) -> usize {
    // Total of the usage reported by each component writer:
    // postings, fieldnorms and fast fields.
    let postings_usage = self.multifield_postings.mem_usage();
    let fieldnorms_usage = self.fieldnorms_writer.mem_usage();
    let fast_fields_usage = self.fast_field_writers.mem_usage();
    postings_usage + fieldnorms_usage + fast_fields_usage
}
/// Indexes a new document

View File

@@ -45,6 +45,11 @@ impl StoreWriter {
}
}
/// Estimates the memory held by this writer's buffers.
///
/// The result is the capacity of `intermediary_buffer` plus the
/// capacity of `current_block`.
pub fn mem_usage(&self) -> usize {
    let intermediary_usage = self.intermediary_buffer.capacity();
    let current_block_usage = self.current_block.capacity();
    intermediary_usage + current_block_usage
}
/// Store a new document.
///
/// The document id is implicitly the number of times