mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-23 19:50:42 +00:00
* fix clippy * fix clippy fastfield codecs * fix clippy bitpacker * fix clippy common * fix clippy stacker * fix clippy sstable * fmt
327 lines
10 KiB
Rust
327 lines
10 KiB
Rust
use std::mem;
|
|
|
|
use common::serialize_vint_u32;
|
|
|
|
use crate::memory_arena::{load, store};
|
|
use crate::{Addr, MemoryArena};
|
|
|
|
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
|
const FIRST_BLOCK: usize = 16;
|
|
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
|
|
|
|
enum CapacityResult {
|
|
Available(u32),
|
|
NeedAlloc(u32),
|
|
}
|
|
|
|
fn len_to_capacity(len: u32) -> CapacityResult {
|
|
match len {
|
|
0..=15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
|
|
16..=MAX_BLOCK_LEN => {
|
|
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
|
|
let available = cap - len;
|
|
if available == 0 {
|
|
CapacityResult::NeedAlloc(len)
|
|
} else {
|
|
CapacityResult::Available(available)
|
|
}
|
|
}
|
|
n => {
|
|
let available = n % MAX_BLOCK_LEN;
|
|
if available == 0 {
|
|
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
|
|
} else {
|
|
CapacityResult::Available(MAX_BLOCK_LEN - available)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// An exponential unrolled link.
|
|
///
|
|
/// The use case is as follows. Tantivy's indexer conceptually acts like a
|
|
/// `HashMap<Term, Vec<u32>>`. As we come across a given term in document
|
|
/// `D`, we lookup the term in the map and append the document id to its vector.
|
|
///
|
|
/// The vector is then only read when it is serialized.
|
|
///
|
|
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
|
|
/// problem.
|
|
///
|
|
/// It combines the idea of the unrolled linked list and tries to address the
|
|
/// problem of selecting an adequate block size using a strategy similar to
|
|
/// that of the `Vec` amortized resize strategy.
|
|
///
|
|
/// Data is stored in a linked list of blocks. The first block has a size of `4`
|
|
/// and each block has a length of twice that of the previous block up to
|
|
/// `MAX_BLOCK_LEN = 32768`.
|
|
///
|
|
/// This strategy is a good trade off to handle numerous very rare terms
|
|
/// and avoid wasting half of the memory for very frequent terms.
|
|
#[derive(Debug, Clone, Copy)]
|
|
pub struct ExpUnrolledLinkedList {
|
|
len: u32,
|
|
tail: Addr,
|
|
inlined_data: [u8; INLINED_BLOCK_LEN],
|
|
}
|
|
|
|
pub struct ExpUnrolledLinkedListWriter<'a> {
|
|
eull: &'a mut ExpUnrolledLinkedList,
|
|
arena: &'a mut MemoryArena,
|
|
}
|
|
|
|
fn ensure_capacity<'a>(
|
|
eull: &'a mut ExpUnrolledLinkedList,
|
|
arena: &'a mut MemoryArena,
|
|
) -> &'a mut [u8] {
|
|
if eull.len <= FIRST_BLOCK as u32 {
|
|
// We are still hitting the inline block.
|
|
if eull.len < FIRST_BLOCK as u32 {
|
|
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
|
|
}
|
|
// We need to allocate a new block!
|
|
let new_block_addr: Addr = arena.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
|
|
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
|
|
eull.tail = new_block_addr;
|
|
return arena.slice_mut(eull.tail, FIRST_BLOCK);
|
|
}
|
|
let len = match len_to_capacity(eull.len) {
|
|
CapacityResult::NeedAlloc(new_block_len) => {
|
|
let new_block_addr: Addr =
|
|
arena.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
|
|
arena.write_at(eull.tail, new_block_addr);
|
|
eull.tail = new_block_addr;
|
|
new_block_len
|
|
}
|
|
CapacityResult::Available(available) => available,
|
|
};
|
|
arena.slice_mut(eull.tail, len as usize)
|
|
}
|
|
|
|
impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
|
pub fn write_u32_vint(&mut self, val: u32) {
|
|
let mut buf = [0u8; 8];
|
|
let data = serialize_vint_u32(val, &mut buf);
|
|
self.extend_from_slice(data);
|
|
}
|
|
|
|
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
|
|
while !buf.is_empty() {
|
|
let add_len: usize;
|
|
{
|
|
let output_buf = ensure_capacity(self.eull, self.arena);
|
|
add_len = buf.len().min(output_buf.len());
|
|
output_buf[..add_len].copy_from_slice(&buf[..add_len]);
|
|
}
|
|
self.eull.len += add_len as u32;
|
|
self.eull.tail = self.eull.tail.offset(add_len as u32);
|
|
buf = &buf[add_len..];
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for ExpUnrolledLinkedList {
|
|
fn default() -> ExpUnrolledLinkedList {
|
|
ExpUnrolledLinkedList {
|
|
len: 0u32,
|
|
tail: Addr::null_pointer(),
|
|
inlined_data: [0u8; INLINED_BLOCK_LEN],
|
|
}
|
|
}
|
|
}
|
|
|
|
impl ExpUnrolledLinkedList {
|
|
#[inline]
|
|
pub fn writer<'a>(&'a mut self, arena: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
|
|
ExpUnrolledLinkedListWriter { eull: self, arena }
|
|
}
|
|
|
|
pub fn read_to_end(&self, arena: &MemoryArena, output: &mut Vec<u8>) {
|
|
let len = self.len as usize;
|
|
if len <= FIRST_BLOCK {
|
|
output.extend_from_slice(&self.inlined_data[..len]);
|
|
return;
|
|
}
|
|
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
|
|
let mut cur = FIRST_BLOCK;
|
|
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
|
|
loop {
|
|
let cap = match len_to_capacity(cur as u32) {
|
|
CapacityResult::Available(capacity) => capacity,
|
|
CapacityResult::NeedAlloc(capacity) => capacity,
|
|
} as usize;
|
|
let data = arena.slice(addr, cap);
|
|
if cur + cap >= len {
|
|
output.extend_from_slice(&data[..(len - cur)]);
|
|
return;
|
|
}
|
|
output.extend_from_slice(data);
|
|
cur += cap;
|
|
addr = arena.read(addr.offset(cap as u32));
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use common::{read_u32_vint, write_u32_vint};
|
|
|
|
use super::super::MemoryArena;
|
|
use super::{len_to_capacity, *};
|
|
|
|
#[test]
|
|
fn test_eull() {
|
|
let mut arena = MemoryArena::default();
|
|
let mut stack = ExpUnrolledLinkedList::default();
|
|
stack.writer(&mut arena).extend_from_slice(&[1u8]);
|
|
stack.writer(&mut arena).extend_from_slice(&[2u8]);
|
|
stack.writer(&mut arena).extend_from_slice(&[3u8, 4u8]);
|
|
stack.writer(&mut arena).extend_from_slice(&[5u8]);
|
|
{
|
|
let mut buffer = Vec::new();
|
|
stack.read_to_end(&arena, &mut buffer);
|
|
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_eull_long() {
|
|
let mut arena = MemoryArena::default();
|
|
let mut eull = ExpUnrolledLinkedList::default();
|
|
let data: Vec<u32> = (0..100).collect();
|
|
for &el in &data {
|
|
eull.writer(&mut arena).write_u32_vint(el);
|
|
}
|
|
let mut buffer = Vec::new();
|
|
eull.read_to_end(&arena, &mut buffer);
|
|
let mut result = vec![];
|
|
let mut remaining = &buffer[..];
|
|
while !remaining.is_empty() {
|
|
result.push(read_u32_vint(&mut remaining));
|
|
}
|
|
assert_eq!(&result[..], &data[..]);
|
|
}
|
|
|
|
#[test]
|
|
fn test_eull_interlaced() {
|
|
let mut eull = MemoryArena::default();
|
|
let mut stack = ExpUnrolledLinkedList::default();
|
|
let mut stack2 = ExpUnrolledLinkedList::default();
|
|
|
|
let mut vec1: Vec<u8> = vec![];
|
|
let mut vec2: Vec<u8> = vec![];
|
|
|
|
for i in 0..9 {
|
|
stack.writer(&mut eull).write_u32_vint(i);
|
|
assert!(write_u32_vint(i, &mut vec1).is_ok());
|
|
if i % 2 == 0 {
|
|
stack2.writer(&mut eull).write_u32_vint(i);
|
|
assert!(write_u32_vint(i, &mut vec2).is_ok());
|
|
}
|
|
}
|
|
let mut res1 = vec![];
|
|
let mut res2 = vec![];
|
|
stack.read_to_end(&eull, &mut res1);
|
|
stack2.read_to_end(&eull, &mut res2);
|
|
assert_eq!(&vec1[..], &res1[..]);
|
|
assert_eq!(&vec2[..], &res2[..]);
|
|
}
|
|
|
|
#[test]
|
|
fn test_jump_if_needed() {
|
|
let mut available = 16u32;
|
|
for i in 0..10_000_000 {
|
|
match len_to_capacity(i) {
|
|
CapacityResult::NeedAlloc(cap) => {
|
|
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
|
|
available = cap;
|
|
}
|
|
CapacityResult::Available(cap) => {
|
|
assert_eq!(
|
|
available, cap,
|
|
"Failed len={}: Expected {} Got {}",
|
|
i, available, cap
|
|
);
|
|
}
|
|
}
|
|
available -= 1;
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_jump_if_needed_progression() {
|
|
let mut v = vec![];
|
|
for i in 0.. {
|
|
if v.len() >= 10 {
|
|
break;
|
|
}
|
|
if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) {
|
|
v.push((i, cap));
|
|
}
|
|
}
|
|
assert_eq!(
|
|
&v[..],
|
|
&[
|
|
(16, 16),
|
|
(32, 32),
|
|
(64, 64),
|
|
(128, 128),
|
|
(256, 256),
|
|
(512, 512),
|
|
(1024, 1024),
|
|
(2048, 2048),
|
|
(4096, 4096),
|
|
(8192, 8192)
|
|
]
|
|
);
|
|
}
|
|
}
|
|
|
|
#[cfg(all(test, feature = "unstable"))]
|
|
mod bench {
|
|
use std::iter;
|
|
|
|
use test::Bencher;
|
|
|
|
use super::super::MemoryArena;
|
|
use super::ExpUnrolledLinkedList;
|
|
|
|
const NUM_STACK: usize = 10_000;
|
|
const STACK_SIZE: u32 = 1000;
|
|
|
|
#[bench]
|
|
fn bench_push_vec(bench: &mut Bencher) {
|
|
bench.iter(|| {
|
|
let mut vecs = Vec::with_capacity(100);
|
|
for _ in 0..NUM_STACK {
|
|
vecs.push(Vec::new());
|
|
}
|
|
for s in 0..NUM_STACK {
|
|
for i in 0u32..STACK_SIZE {
|
|
let t = s * 392017 % NUM_STACK;
|
|
vecs[t].push(i);
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn bench_push_stack(bench: &mut Bencher) {
|
|
bench.iter(|| {
|
|
let mut arena = MemoryArena::default();
|
|
let mut stacks: Vec<ExpUnrolledLinkedList> =
|
|
iter::repeat_with(ExpUnrolledLinkedList::default)
|
|
.take(NUM_STACK)
|
|
.collect();
|
|
for s in 0..NUM_STACK {
|
|
for i in 0u32..STACK_SIZE {
|
|
let t = s * 392017 % NUM_STACK;
|
|
stacks[t]
|
|
.writer(&mut arena)
|
|
.extend_from_slice(&i.to_ne_bytes());
|
|
}
|
|
}
|
|
});
|
|
}
|
|
}
|