mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
144 lines
4.2 KiB
Rust
144 lines
4.2 KiB
Rust
pub mod bitpacker;
|
|
mod bitset;
|
|
mod composite_file;
|
|
mod counting_writer;
|
|
mod serialize;
|
|
mod vint;
|
|
|
|
pub use self::bitset::BitSet;
|
|
pub(crate) use self::bitset::TinySet;
|
|
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
|
pub use self::counting_writer::CountingWriter;
|
|
pub use self::serialize::{BinarySerializable, FixedSize};
|
|
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
|
|
pub use byteorder::LittleEndian as Endianness;
|
|
|
|
|
|
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
|
///
|
|
/// We do not allow segments with more than
|
|
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
|
|
|
/// Computes the number of bits that will be used for bitpacking.
|
|
///
|
|
/// In general the target is the minimum number of bits
|
|
/// required to express the amplitude given in argument.
|
|
///
|
|
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
|
|
///
|
|
/// The logic is slightly more convoluted here as for optimization
|
|
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
|
/// of aligns bytes.
|
|
///
|
|
/// Spanning over 9 bytes is possible for instance, if we do
|
|
/// bitpacking with an amplitude of 63 bits.
|
|
/// In this case, the second int will start on bit
|
|
/// 63 (which belongs to byte 7) and ends at byte 15;
|
|
/// Hence 9 bytes (from byte 7 to byte 15 included).
|
|
///
|
|
/// To avoid this, we force the number of bits to 64bits
|
|
/// when the result is greater than `64-8 = 56 bits`.
|
|
///
|
|
/// Note that this only affects rare use cases spawning over
|
|
/// a very large range of values. Even in this case, it results
|
|
/// in an extra cost of at most 12% compared to the optimal
|
|
/// number of bits.
|
|
pub(crate) fn compute_num_bits(n: u64) -> u8 {
|
|
let amplitude = (64u32 - n.leading_zeros()) as u8;
|
|
if amplitude <= 64 - 8 {
|
|
amplitude
|
|
} else {
|
|
64
|
|
}
|
|
}
|
|
|
|
pub(crate) fn is_power_of_2(n: usize) -> bool {
|
|
(n > 0) && (n & (n - 1) == 0)
|
|
}
|
|
|
|
/// Has length trait
|
|
pub trait HasLen {
|
|
/// Return length
|
|
fn len(&self) -> usize;
|
|
|
|
/// Returns true iff empty.
|
|
fn is_empty(&self) -> bool {
|
|
self.len() == 0
|
|
}
|
|
}
|
|
|
|
const HIGHEST_BIT: u64 = 1 << 63;
|
|
|
|
/// Maps a `i64` to `u64`
|
|
///
|
|
/// For simplicity, tantivy internally handles `i64` as `u64`.
|
|
/// The mapping is defined by this function.
|
|
///
|
|
/// Maps `i64` to `u64` so that
|
|
/// `-2^63 .. 2^63-1` is mapped
|
|
/// to
|
|
/// `0 .. 2^64-1`
|
|
/// in that order.
|
|
///
|
|
/// This is more suited than simply casting (`val as u64`)
|
|
/// because of bitpacking.
|
|
///
|
|
/// Imagine a list of `i64` ranging from -10 to 10.
|
|
/// When casting negative values, the negative values are projected
|
|
/// to values over 2^63, and all values end up requiring 64 bits.
|
|
///
|
|
/// # See also
|
|
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
|
#[inline(always)]
|
|
pub fn i64_to_u64(val: i64) -> u64 {
|
|
(val as u64) ^ HIGHEST_BIT
|
|
}
|
|
|
|
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
|
#[inline(always)]
|
|
pub fn u64_to_i64(val: u64) -> i64 {
|
|
(val ^ HIGHEST_BIT) as i64
|
|
}
|
|
|
|
#[cfg(test)]
|
|
pub(crate) mod test {
|
|
|
|
pub use super::serialize::test::fixed_size_test;
|
|
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
|
|
|
fn test_i64_converter_helper(val: i64) {
|
|
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
|
}
|
|
|
|
#[test]
|
|
fn test_i64_converter() {
|
|
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
|
|
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
|
|
test_i64_converter_helper(0i64);
|
|
test_i64_converter_helper(i64::min_value());
|
|
test_i64_converter_helper(i64::max_value());
|
|
for i in -1000i64..1000i64 {
|
|
test_i64_converter_helper(i);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_compute_num_bits() {
|
|
assert_eq!(compute_num_bits(1), 1u8);
|
|
assert_eq!(compute_num_bits(0), 0u8);
|
|
assert_eq!(compute_num_bits(2), 2u8);
|
|
assert_eq!(compute_num_bits(3), 2u8);
|
|
assert_eq!(compute_num_bits(4), 3u8);
|
|
assert_eq!(compute_num_bits(255), 8u8);
|
|
assert_eq!(compute_num_bits(256), 9u8);
|
|
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
|
}
|
|
|
|
#[test]
|
|
fn test_max_doc() {
|
|
// this is the first time I write a unit test for a constant.
|
|
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
|
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
|
}
|
|
}
|