mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
calc mem_usage of more structs
calc mem_usage of more structs in index creation add some comments
This commit is contained in:
@@ -10,21 +10,24 @@ mod tests {
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val);
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut out = 0;
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.get(val);
|
||||
out = blocked_bitpacker.get(val);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_blockbitp_create(b: &mut Bencher) {
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val);
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
blocked_bitpacker
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use super::{bitpacker::BitPacker, compute_num_bits};
|
||||
|
||||
const BLOCK_SIZE: usize = 128;
|
||||
|
||||
/// BlockedBitpacker compresses data in blocks of
|
||||
/// `BlockedBitpacker` compresses data in blocks of
|
||||
/// 128 elements, while keeping an index on it
|
||||
///
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -16,6 +16,12 @@ pub struct BlockedBitpacker {
|
||||
offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
|
||||
}
|
||||
|
||||
/// `BlockedBitpackerEntryMetaData` encodes the
|
||||
/// offset and bit_width into a u64 bit field
|
||||
///
|
||||
/// This saves some space, since 7byte is more
|
||||
/// than enough and also keeps the access fast
|
||||
/// because of alignment
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct BlockedBitpackerEntryMetaData {
|
||||
encoded: u64,
|
||||
@@ -23,7 +29,7 @@ struct BlockedBitpackerEntryMetaData {
|
||||
|
||||
impl BlockedBitpackerEntryMetaData {
|
||||
fn new(offset: u64, num_bits: u8) -> Self {
|
||||
let encoded = offset | (num_bits as u64) << 56;
|
||||
let encoded = offset | (num_bits as u64) << (64 - 8);
|
||||
Self { encoded }
|
||||
}
|
||||
fn offset(&self) -> u64 {
|
||||
@@ -33,6 +39,7 @@ impl BlockedBitpackerEntryMetaData {
|
||||
(self.encoded >> 56) as u8
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn metadata_test() {
|
||||
let meta = BlockedBitpackerEntryMetaData::new(50000, 6);
|
||||
@@ -51,8 +58,10 @@ impl BlockedBitpacker {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_memory_usage(&self) -> usize {
|
||||
self.compressed_blocks.capacity()
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
std::mem::size_of::<BlockedBitpacker>()
|
||||
+ self.compressed_blocks.capacity()
|
||||
+ self.offset_and_bits.capacity()
|
||||
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
|
||||
+ self.cache.capacity()
|
||||
@@ -80,6 +89,10 @@ impl BlockedBitpacker {
|
||||
self.compressed_blocks
|
||||
.resize(self.compressed_blocks.len() - 8, 0); // remove padding for bitpacker
|
||||
let offset = self.compressed_blocks.len() as u64;
|
||||
// todo performance: for some bit_width we
|
||||
// can encode multiple vals into the
|
||||
// mini_buffer before checking to flush
|
||||
// (to be done in BitPacker)
|
||||
for val in self.cache.iter() {
|
||||
bit_packer
|
||||
.write(*val, num_bits_block, &mut self.compressed_blocks)
|
||||
@@ -108,6 +121,7 @@ impl BlockedBitpacker {
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
// todo performance: we could decompress the whole block and cache it instead
|
||||
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
|
||||
let iter = (0..bitpacked_elems)
|
||||
.map(move |idx| self.get(idx))
|
||||
|
||||
@@ -14,7 +14,7 @@ pub use crate::blocked_bitpacker::BlockedBitpacker;
|
||||
///
|
||||
/// The logic is slightly more convoluted here as for optimization
|
||||
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
||||
/// of aligns bytes.
|
||||
/// of aligned bytes.
|
||||
///
|
||||
/// Spanning over 9 bytes is possible for instance, if we do
|
||||
/// bitpacking with an amplitude of 63 bits.
|
||||
|
||||
@@ -35,6 +35,10 @@ impl BytesFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.capacity() + self.doc_index.capacity() * std::mem::size_of::<u64>()
|
||||
}
|
||||
/// Access the field associated to the `BytesFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
|
||||
@@ -48,6 +48,12 @@ impl MultiValuedFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.capacity() * std::mem::size_of::<UnorderedTermId>()
|
||||
+ self.doc_index.capacity() * std::mem::size_of::<u64>()
|
||||
}
|
||||
|
||||
/// Access the field associated to the `MultiValuedFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use crate::common;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
@@ -73,6 +71,24 @@ impl FastFieldsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.bytes_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
@@ -178,6 +194,11 @@ impl IntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.mem_usage()
|
||||
}
|
||||
|
||||
/// Returns the field that this writer is targetting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
|
||||
@@ -50,6 +50,13 @@ impl FieldNormsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used inclusive childs
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.fieldnorms_buffer
|
||||
.iter()
|
||||
.map(|buf| buf.capacity())
|
||||
.sum()
|
||||
}
|
||||
/// Ensure that all documents in 0..max_doc have a byte associated with them
|
||||
/// in each of the fieldnorm vectors.
|
||||
///
|
||||
|
||||
@@ -111,6 +111,8 @@ impl SegmentWriter {
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.multifield_postings.mem_usage()
|
||||
+ self.fieldnorms_writer.mem_usage()
|
||||
+ self.fast_field_writers.mem_usage()
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
|
||||
@@ -45,6 +45,11 @@ impl StoreWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.intermediary_buffer.capacity() + self.current_block.capacity()
|
||||
}
|
||||
|
||||
/// Store a new document.
|
||||
///
|
||||
/// The document id is implicitely the number of times
|
||||
|
||||
Reference in New Issue
Block a user