Merge branch 'main' into issue-more-like-this-query

Evance Soumaoro
2021-05-03 10:08:38 +00:00
committed by GitHub
20 changed files with 367 additions and 94 deletions


@@ -10,6 +10,7 @@ Tantivy 0.15.0
- Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
- Simplified positions index format (@fulmicoton) #1022
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
Tantivy 0.14.0


@@ -35,6 +35,7 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.8"
futures = {version = "0.3", features=["thread-pool"] }
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
stable_deref_trait = "1"
rust-stemmers = "1"
downcast-rs = "1"
@@ -86,7 +87,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar"]
members = ["query-grammar", "bitpacker"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }


@@ -38,7 +38,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
# Features
- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmente](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as Lucene)

bitpacker/Cargo.toml Normal file

@@ -0,0 +1,8 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]


@@ -0,0 +1,33 @@
#![feature(test)]
extern crate test;
#[cfg(test)]
mod tests {
use tantivy_bitpacker::BlockedBitpacker;
use test::Bencher;
#[bench]
fn bench_blockedbitp_read(b: &mut Bencher) {
let mut blocked_bitpacker = BlockedBitpacker::new();
for val in 0..=21500 {
blocked_bitpacker.add(val * val);
}
b.iter(|| {
let mut out = 0;
for val in 0..=21500 {
out = blocked_bitpacker.get(val);
}
out
});
}
#[bench]
fn bench_blockedbitp_create(b: &mut Bencher) {
b.iter(|| {
let mut blocked_bitpacker = BlockedBitpacker::new();
for val in 0..=21500 {
blocked_bitpacker.add(val * val);
}
blocked_bitpacker
});
}
}


@@ -1,9 +1,6 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::{convert::TryInto, io};
use crate::directory::OwnedBytes;
pub(crate) struct BitPacker {
pub struct BitPacker {
mini_buffer: u64,
mini_buffer_written: usize,
}
@@ -26,14 +23,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?;
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?;
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -44,9 +41,8 @@ impl BitPacker {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
@@ -64,11 +60,10 @@ impl BitPacker {
pub struct BitUnpacker {
num_bits: u64,
mask: u64,
data: OwnedBytes,
}
impl BitUnpacker {
pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker {
pub fn new(num_bits: u8) -> BitUnpacker {
let mask: u64 = if num_bits == 64 {
!0u64
} else {
@@ -77,15 +72,13 @@ impl BitUnpacker {
BitUnpacker {
num_bits: u64::from(num_bits),
mask,
data,
}
}
pub fn get(&self, idx: u64) -> u64 {
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
let data: &[u8] = self.data.as_slice();
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
@@ -95,7 +88,10 @@ impl BitUnpacker {
addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
.try_into()
.unwrap();
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -104,9 +100,8 @@ impl BitUnpacker {
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker};
use crate::directory::OwnedBytes;
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>) {
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new();
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
@@ -118,14 +113,14 @@ mod test {
}
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits);
(bitunpacker, vals)
let bitunpacker = BitUnpacker::new(num_bits);
(bitunpacker, vals, data)
}
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u64), *val);
assert_eq!(bitunpacker.get(i as u64, &data), *val);
}
}
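
The hunk above makes `BitUnpacker` stateless: it no longer owns an `OwnedBytes` buffer, and callers now pass the packed bytes to `get`. The following is a minimal round-trip sketch of the reworked API, written against the signatures visible in this diff (`BitPacker::write`/`close`, `BitUnpacker::new`/`get`); it is an illustration, not code from the PR.

use tantivy_bitpacker::{BitPacker, BitUnpacker};

fn roundtrip() -> std::io::Result<()> {
    let vals: Vec<u64> = (0..100).collect();
    let num_bits: u8 = 7; // enough for values below 128

    // Pack: values are appended to an ordinary byte buffer.
    let mut packed: Vec<u8> = Vec::new();
    let mut packer = BitPacker::new();
    for &val in &vals {
        packer.write(val, num_bits, &mut packed)?;
    }
    packer.close(&mut packed)?; // flushes the mini buffer and adds the 7-byte padding

    // Unpack: the unpacker only remembers the bit width; the data is now an argument.
    let unpacker = BitUnpacker::new(num_bits);
    for (i, &val) in vals.iter().enumerate() {
        assert_eq!(unpacker.get(i as u64, &packed), val);
    }
    Ok(())
}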


@@ -0,0 +1,173 @@
use crate::{minmax, BitUnpacker};
use super::{bitpacker::BitPacker, compute_num_bits};
const BLOCK_SIZE: usize = 128;
/// `BlockedBitpacker` compresses data in blocks of
/// 128 elements, while keeping an index on it
///
#[derive(Debug, Clone)]
pub struct BlockedBitpacker {
// bitpacked blocks
compressed_blocks: Vec<u8>,
// uncompressed data, collected until BLOCK_SIZE
buffer: Vec<u64>,
offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
}
/// `BlockedBitpackerEntryMetaData` encodes the
/// offset and bit_width into a u64 bit field
///
/// This saves some space, since 7byte is more
/// than enough and also keeps the access fast
/// because of alignment
#[derive(Debug, Clone, Default)]
struct BlockedBitpackerEntryMetaData {
encoded: u64,
base_value: u64,
}
impl BlockedBitpackerEntryMetaData {
fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
let encoded = offset | (num_bits as u64) << (64 - 8);
Self {
encoded,
base_value,
}
}
fn offset(&self) -> u64 {
(self.encoded << 8) >> 8
}
fn num_bits(&self) -> u8 {
(self.encoded >> 56) as u8
}
fn base_value(&self) -> u64 {
self.base_value
}
}
#[test]
fn metadata_test() {
let meta = BlockedBitpackerEntryMetaData::new(50000, 6, 40000);
assert_eq!(meta.offset(), 50000);
assert_eq!(meta.num_bits(), 6);
}
impl BlockedBitpacker {
pub fn new() -> Self {
let mut compressed_blocks = vec![];
compressed_blocks.resize(8, 0);
Self {
compressed_blocks,
buffer: vec![],
offset_and_bits: vec![],
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
std::mem::size_of::<BlockedBitpacker>()
+ self.compressed_blocks.capacity()
+ self.offset_and_bits.capacity()
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
+ self.buffer.capacity()
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
}
pub fn add(&mut self, val: u64) {
self.buffer.push(val);
if self.buffer.len() == BLOCK_SIZE as usize {
self.flush();
}
}
pub fn flush(&mut self) {
if let Some((min_value, max_value)) = minmax(self.buffer.iter()) {
let mut bit_packer = BitPacker::new();
let num_bits_block = compute_num_bits(*max_value - min_value);
// todo performance: the padding handling could be done better, e.g. use a slice and
// return num_bytes written from bitpacker
self.compressed_blocks
.resize(self.compressed_blocks.len() - 8, 0); // remove padding for bitpacker
let offset = self.compressed_blocks.len() as u64;
// todo performance: for some bit_width we
// can encode multiple vals into the
// mini_buffer before checking to flush
// (to be done in BitPacker)
for val in self.buffer.iter() {
bit_packer
.write(
*val - min_value,
num_bits_block,
&mut self.compressed_blocks,
)
.expect("cannot write bitpacking to output"); // write to in memory can't fail
}
bit_packer.flush(&mut self.compressed_blocks).unwrap();
self.offset_and_bits
.push(BlockedBitpackerEntryMetaData::new(
offset,
num_bits_block,
*min_value,
));
self.buffer.clear();
self.compressed_blocks
.resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
} else {
return;
}
}
pub fn get(&self, idx: usize) -> u64 {
let metadata_pos = idx / BLOCK_SIZE as usize;
let pos_in_block = idx % BLOCK_SIZE as usize;
if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
let unpacked = BitUnpacker::new(metadata.num_bits()).get(
pos_in_block as u64,
&self.compressed_blocks[metadata.offset() as usize..],
);
unpacked + metadata.base_value()
} else {
self.buffer[pos_in_block]
}
}
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress a whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
let iter = (0..bitpacked_elems)
.map(move |idx| self.get(idx))
.chain(self.buffer.iter().cloned());
iter
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn blocked_bitpacker_empty() {
let blocked_bitpacker = BlockedBitpacker::new();
assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![]);
}
#[test]
fn blocked_bitpacker_one() {
let mut blocked_bitpacker = BlockedBitpacker::new();
blocked_bitpacker.add(50000);
assert_eq!(blocked_bitpacker.get(0), 50000);
assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![50000]);
}
#[test]
fn blocked_bitpacker_test() {
let mut blocked_bitpacker = BlockedBitpacker::new();
for val in 0..21500 {
blocked_bitpacker.add(val);
}
for val in 0..21500 {
assert_eq!(blocked_bitpacker.get(val as usize), val);
}
assert_eq!(blocked_bitpacker.iter().count(), 21500);
assert_eq!(blocked_bitpacker.iter().last().unwrap(), 21499);
}
}
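
`BlockedBitpacker`, added above, buffers incoming values and bitpacks them in blocks of 128, each block stored relative to its minimum value alongside a small metadata entry (offset and bit width). A brief usage sketch, assuming only the public methods shown in this file (`add`, `get`, `iter`, `mem_usage`):

use tantivy_bitpacker::BlockedBitpacker;

fn main() {
    let mut packer = BlockedBitpacker::new();
    // Values are buffered and flushed into a bitpacked block every 128 adds.
    for val in 0..1_000u64 {
        packer.add(val * 3);
    }
    // `get` reads either from a packed block or, for the trailing < 128
    // values, straight from the in-memory buffer.
    assert_eq!(packer.get(500), 1_500);
    // `iter` yields the packed blocks first, then the not-yet-flushed buffer.
    assert_eq!(packer.iter().count(), 1_000);
    // Rough memory footprint (compressed blocks + metadata + buffer).
    println!("mem usage: {} bytes", packer.mem_usage());
}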

bitpacker/src/lib.rs Normal file

@@ -0,0 +1,52 @@
mod bitpacker;
mod blocked_bitpacker;
pub use crate::bitpacker::BitPacker;
pub use crate::bitpacker::BitUnpacker;
pub use crate::blocked_bitpacker::BlockedBitpacker;
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
///
/// The logic is slightly more convoluted here as for optimization
/// reasons, we want to ensure that a value spawns over at most 8 bytes
/// of aligned bytes.
///
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
/// Hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64bits
/// when the result is greater than `64-8 = 56 bits`.
///
/// Note that this only affects rare use cases spawning over
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub fn compute_num_bits(n: u64) -> u8 {
let amplitude = (64u32 - n.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
} else {
64
}
}
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
where
I: Iterator<Item = T>,
T: Copy + Ord,
{
if let Some(first_el) = vals.next() {
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
(min_val.min(el), max_val.max(el))
}));
}
None
}
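
The doc comment above motivates the 56-bit cutoff in `compute_num_bits`. A few spot checks of it and of `minmax`, as a sketch assuming the public exports of the new crate:

use tantivy_bitpacker::{compute_num_bits, minmax};

fn main() {
    // An amplitude of 10 fits in 4 bits; zero amplitude needs no bits.
    assert_eq!(compute_num_bits(10), 4);
    assert_eq!(compute_num_bits(0), 0);
    // 56 bits is the largest exact answer ...
    assert_eq!(compute_num_bits((1u64 << 56) - 1), 56);
    // ... above that the width is forced to 64 so that a single value never
    // spans more than 8 aligned bytes.
    assert_eq!(compute_num_bits(1u64 << 56), 64);

    // minmax consumes the iterator once and returns None when it is empty.
    assert_eq!(minmax([3u64, 1, 2].iter().copied()), Some((1, 3)));
    assert_eq!(minmax(std::iter::empty::<u64>()), None);
}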


@@ -1,4 +1,3 @@
pub mod bitpacker;
mod bitset;
mod composite_file;
mod counting_writer;
@@ -20,52 +19,6 @@ pub use byteorder::LittleEndian as Endianness;
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
where
I: Iterator<Item = T>,
T: Copy + Ord,
{
if let Some(first_el) = vals.next() {
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
(min_val.min(el), max_val.max(el))
}));
}
None
}
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
///
/// The logic is slightly more convoluted here as for optimization
/// reasons, we want to ensure that a value spawns over at most 8 bytes
/// of aligns bytes.
///
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
/// Hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64bits
/// when the result is greater than `64-8 = 56 bits`.
///
/// Note that this only affects rare use cases spawning over
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub(crate) fn compute_num_bits(n: u64) -> u8 {
let amplitude = (64u32 - n.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
} else {
64
}
}
/// Has length trait
pub trait HasLen {
/// Return length
@@ -150,11 +103,12 @@ pub fn u64_to_f64(val: u64) -> f64 {
#[cfg(test)]
pub(crate) mod test {
pub use super::minmax;
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
pub use tantivy_bitpacker::minmax;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);


@@ -35,6 +35,10 @@ impl BytesFastFieldWriter {
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.capacity() + self.doc_index.capacity() * std::mem::size_of::<u64>()
}
/// Access the field associated to the `BytesFastFieldWriter`
pub fn field(&self) -> Field {
self.field


@@ -8,6 +8,7 @@ use crate::DocId;
use fnv::FnvHashMap;
use std::io;
use std::iter::once;
use tantivy_bitpacker::minmax;
/// Writer for multi-valued (as in, more than one value per document)
/// int fast field.
@@ -48,6 +49,12 @@ impl MultiValuedFastFieldWriter {
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.capacity() * std::mem::size_of::<UnorderedTermId>()
+ self.doc_index.capacity() * std::mem::size_of::<u64>()
}
/// Access the field associated to the `MultiValuedFastFieldWriter`
pub fn field(&self) -> Field {
self.field
@@ -148,7 +155,7 @@ impl MultiValuedFastFieldWriter {
}
}
None => {
let val_min_max = crate::common::minmax(self.vals.iter().cloned());
let val_min_max = minmax(self.vals.iter().cloned());
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;


@@ -1,9 +1,8 @@
use super::FastValue;
use crate::common::bitpacker::BitUnpacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
@@ -12,6 +11,8 @@ use crate::DocId;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
/// Trait for accessing a fastfield.
///
@@ -19,6 +20,7 @@ use std::path::Path;
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
@@ -33,8 +35,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(bytes, num_bits);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(FastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
@@ -55,7 +58,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
}
/// Internally `multivalued` also use SingleValue Fast fields.


@@ -1,11 +1,11 @@
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.


@@ -1,7 +1,5 @@
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
@@ -9,6 +7,7 @@ use crate::termdict::TermOrdinal;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
use tantivy_bitpacker::BlockedBitpacker;
/// The fastfieldswriter regroup all of the fast field writers.
pub struct FastFieldsWriter {
@@ -72,6 +71,24 @@ impl FastFieldsWriter {
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.bytes_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
@@ -157,7 +174,7 @@ impl FastFieldsWriter {
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
vals: Vec<u8>,
vals: BlockedBitpacker,
val_count: usize,
val_if_missing: u64,
val_min: u64,
@@ -169,7 +186,7 @@ impl IntFastFieldWriter {
pub fn new(field: Field) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
vals: Vec::new(),
vals: BlockedBitpacker::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::max_value(),
@@ -177,6 +194,11 @@ impl IntFastFieldWriter {
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.mem_usage()
}
/// Returns the field that this writer is targetting.
pub fn field(&self) -> Field {
self.field
@@ -196,9 +218,7 @@ impl IntFastFieldWriter {
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
self.vals.add(val);
if val > self.val_max {
self.val_max = val;
@@ -244,8 +264,7 @@ impl IntFastFieldWriter {
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
let mut cursor = self.vals.as_slice();
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
for val in self.vals.iter() {
single_field_serializer.add_val(val)?;
}


@@ -50,6 +50,13 @@ impl FieldNormsWriter {
}
}
/// The memory used inclusive childs
pub fn mem_usage(&self) -> usize {
self.fieldnorms_buffer
.iter()
.map(|buf| buf.capacity())
.sum()
}
/// Ensure that all documents in 0..max_doc have a byte associated with them
/// in each of the fieldnorm vectors.
///


@@ -1,3 +1,5 @@
use tantivy_bitpacker::minmax;
use crate::common::MAX_DOC_LIMIT;
use crate::core::Segment;
use crate::core::SegmentReader;
@@ -70,7 +72,7 @@ fn compute_min_max_val(
Some(delete_bitset) => {
// some deleted documents,
// we need to recompute the max / min
crate::common::minmax(
minmax(
(0..max_doc)
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
.map(|doc_id| u64_reader.get(doc_id)),


@@ -36,6 +36,11 @@ impl SegmentSerializer {
})
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.store_writer.mem_usage()
}
pub fn segment(&self) -> &Segment {
&self.segment
}


@@ -111,6 +111,9 @@ impl SegmentWriter {
pub fn mem_usage(&self) -> usize {
self.multifield_postings.mem_usage()
+ self.fieldnorms_writer.mem_usage()
+ self.fast_field_writers.mem_usage()
+ self.segment_serializer.mem_usage()
}
/// Indexes a new document


@@ -45,6 +45,11 @@ impl StoreWriter {
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.intermediary_buffer.capacity() + self.current_block.capacity()
}
/// Store a new document.
///
/// The document id is implicitely the number of times


@@ -1,11 +1,12 @@
use crate::common::compute_num_bits;
use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize};
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use byteorder::{ByteOrder, LittleEndian};
use std::cmp;
use std::io::{self, Read, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
const BLOCK_LEN: usize = 256;
@@ -290,11 +291,11 @@ mod tests {
use super::TermInfoBlockMeta;
use super::{TermInfoStore, TermInfoStoreWriter};
use crate::common;
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::postings::TermInfo;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
#[test]
fn test_term_info_block() {