mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
ISSUE-8 Replacing Vec by an Unrolled Linked List.
This commit is contained in:
@@ -5,6 +5,7 @@ use directory::error::{FileError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::io;
|
||||
use std::marker::Sync;
|
||||
|
||||
/// Write-once read many (WORM) abstraction for where tantivy's index should be stored.
|
||||
///
|
||||
@@ -15,7 +16,7 @@ use std::io;
|
||||
/// - The [RAMDirectory](struct.RAMDirectory.html), which
|
||||
/// should be used mostly for tests.
|
||||
///
|
||||
pub trait Directory: fmt::Debug + Send + 'static {
|
||||
pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
|
||||
/// Opens a virtual file for read.
|
||||
///
|
||||
|
||||
@@ -28,6 +28,7 @@ pub struct IndexWriter {
|
||||
target_num_docs: usize,
|
||||
num_threads: usize,
|
||||
docstamp: u64,
|
||||
|
||||
}
|
||||
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
pub mod index_writer;
|
||||
mod index_writer;
|
||||
pub mod segment_serializer;
|
||||
pub mod merger;
|
||||
|
||||
|
||||
11
src/lib.rs
11
src/lib.rs
@@ -1,11 +1,7 @@
|
||||
/*!
|
||||
|
||||
# Creating a new index, adding documents and searching.
|
||||
Tantivy is a search engine library.
|
||||
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
*/
|
||||
|
||||
#![feature(binary_heap_extras)]
|
||||
@@ -71,6 +67,7 @@ pub mod schema;
|
||||
pub use directory::Directory;
|
||||
pub use core::searcher::Searcher;
|
||||
pub use core::Index;
|
||||
pub use indexer::IndexWriter;
|
||||
pub use schema::Term;
|
||||
pub use schema::Document;
|
||||
pub use core::SegmentReader;
|
||||
@@ -158,10 +155,6 @@ mod tests {
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
// TODO reenable this test
|
||||
// let segment = commit_result.unwrap();
|
||||
// let segment_reader = SegmentReader::open(segment).unwrap();
|
||||
// assert_eq!(segment_reader.max_doc(), 3);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
265
src/postings/block_appender.rs
Normal file
265
src/postings/block_appender.rs
Normal file
@@ -0,0 +1,265 @@
|
||||
use compression::NUM_DOCS_PER_BLOCK;
|
||||
use DocId;
|
||||
|
||||
const BLOCK_SIZE: u32 = NUM_DOCS_PER_BLOCK as u32;
|
||||
|
||||
struct Block {
|
||||
data: [u32; BLOCK_SIZE as usize],
|
||||
next: u32,
|
||||
}
|
||||
|
||||
impl Block {
|
||||
fn new() -> Block {
|
||||
Block {
|
||||
data: [0u32; BLOCK_SIZE as usize],
|
||||
next: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Bookkeeping for one list stored in a `BlockStore`.
#[derive(Copy, Clone)]
struct ListInfo {
    /// Index of the first block of the list.
    first: u32,
    /// Index of the block that currently receives appended values.
    last: u32,
    /// Number of values in the list.
    len: u32,
}
|
||||
|
||||
pub struct BlockStore {
|
||||
lists: Vec<ListInfo>,
|
||||
blocks: Vec<Block>,
|
||||
free_blocks: Vec<u32>,
|
||||
}
|
||||
|
||||
impl BlockStore {
|
||||
pub fn allocate(num_blocks: usize) -> BlockStore {
|
||||
BlockStore {
|
||||
lists: Vec::with_capacity(1_000_000),
|
||||
blocks: (0 .. num_blocks).map(|_| Block::new()).collect(),
|
||||
free_blocks: (0u32 .. num_blocks as u32).collect()
|
||||
}
|
||||
}
|
||||
|
||||
fn new_list(&mut self, first_el: u32) -> u32 {
|
||||
let res = self.lists.len() as u32;
|
||||
let new_block_id = self.new_block().unwrap();
|
||||
self.blocks[new_block_id as usize].data[0] = first_el;
|
||||
self.lists.push(ListInfo {
|
||||
first: new_block_id,
|
||||
last: new_block_id,
|
||||
len: 1,
|
||||
});
|
||||
res
|
||||
}
|
||||
|
||||
fn new_block(&mut self,) -> Option<u32> {
|
||||
self.free_blocks.pop()
|
||||
.map(|block_id| {
|
||||
self.blocks[block_id as usize].next = u32::max_value();
|
||||
block_id
|
||||
})
|
||||
}
|
||||
|
||||
fn get_list_info(&mut self, list_id: u32) -> &mut ListInfo {
|
||||
&mut self.lists[list_id as usize]
|
||||
}
|
||||
|
||||
|
||||
fn block_id_to_append(&mut self, list_id: u32) -> u32 {
|
||||
let list_info: ListInfo = self.lists[list_id as usize];
|
||||
// get_list_info(list_id).len % BLOCK_SIZE == 0;
|
||||
// let new_block_required: bool = self.get_list_info(list_id).len % BLOCK_SIZE == 0;
|
||||
if list_info.len % BLOCK_SIZE == 0 {
|
||||
// we need to add a fresh new block.
|
||||
let new_block_id: u32 = { self.new_block().unwrap() };
|
||||
let last_block_id: usize;
|
||||
{
|
||||
// update the list info.
|
||||
let list_info: &mut ListInfo = self.get_list_info(list_id);
|
||||
last_block_id = list_info.last as usize;
|
||||
list_info.last = new_block_id;
|
||||
}
|
||||
self.blocks[last_block_id].next = new_block_id;
|
||||
new_block_id
|
||||
}
|
||||
else {
|
||||
list_info.last
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, list_id: u32, val: u32) {
|
||||
let new_block_required: bool = self.get_list_info(list_id).len % BLOCK_SIZE == 0;
|
||||
let block_id: u32 = self.block_id_to_append(list_id);
|
||||
let list_len: u32;
|
||||
{
|
||||
let list_info: &mut ListInfo = self.get_list_info(list_id);
|
||||
list_len = list_info.len;
|
||||
list_info.len += 1u32;
|
||||
}
|
||||
self.blocks[block_id as usize].data[(list_len % BLOCK_SIZE) as usize] = val;
|
||||
}
|
||||
|
||||
pub fn iter_list(&self, list_id: u32) -> BlockIterator {
|
||||
let list_info = &self.lists[list_id as usize];
|
||||
BlockIterator {
|
||||
current_block: &self.blocks[list_info.first as usize],
|
||||
blocks: &self.blocks,
|
||||
cursor: 0,
|
||||
len: list_info.len as usize,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct BlockIterator<'a> {
|
||||
current_block: &'a Block,
|
||||
blocks: &'a [Block],
|
||||
cursor: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Iterator for BlockIterator<'a> {
|
||||
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.cursor == self.len {
|
||||
None
|
||||
}
|
||||
else {
|
||||
let res = self.current_block.data[self.cursor % (BLOCK_SIZE as usize)];
|
||||
self.cursor += 1;
|
||||
if self.cursor % (BLOCK_SIZE as usize) == 0 {
|
||||
self.current_block = &self.blocks[self.current_block.next as usize];
|
||||
}
|
||||
Some(res)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
pub struct BlockAppender {
|
||||
blocks: Vec<Box<[DocId; NUM_DOCS_PER_BLOCK]>>,
|
||||
doc_freq: usize,
|
||||
}
|
||||
|
||||
impl BlockAppender {
|
||||
|
||||
pub fn new() -> BlockAppender {
|
||||
BlockAppender {
|
||||
blocks: Vec::new(),
|
||||
doc_freq: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, doc_id: DocId) {
|
||||
if self.doc_freq % NUM_DOCS_PER_BLOCK == 0 {
|
||||
self.blocks.push(Box::new([0u32; NUM_DOCS_PER_BLOCK ]));
|
||||
}
|
||||
self.blocks[self.doc_freq / NUM_DOCS_PER_BLOCK][self.doc_freq % NUM_DOCS_PER_BLOCK] = doc_id;
|
||||
self.doc_freq += 1;
|
||||
}
|
||||
|
||||
pub fn last(&self) -> Option<DocId> {
|
||||
if self.doc_freq == 0 {
|
||||
return None
|
||||
}
|
||||
else {
|
||||
Some(self.get(self.doc_freq - 1))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self,) -> usize {
|
||||
self.doc_freq
|
||||
}
|
||||
|
||||
|
||||
pub fn get(&self, cursor: usize) -> DocId {
|
||||
self.blocks[cursor / NUM_DOCS_PER_BLOCK][cursor % NUM_DOCS_PER_BLOCK]
|
||||
}
|
||||
|
||||
|
||||
pub fn iter(&self,) -> IterBlockAppender {
|
||||
IterBlockAppender {
|
||||
cursor: 0,
|
||||
block_appender: &self,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct IterBlockAppender<'a> {
|
||||
cursor: usize,
|
||||
block_appender: &'a BlockAppender,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Iterator for IterBlockAppender<'a> {
|
||||
|
||||
type Item = DocId;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.cursor == self.block_appender.doc_freq {
|
||||
return None
|
||||
}
|
||||
else {
|
||||
let res = self.block_appender.get(self.cursor);
|
||||
self.cursor += 1;
|
||||
Some(res)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;

    /// Interleaves pushes into four lists sharing one `BlockStore` and checks
    /// that each list yields exactly the values pushed into it, in order.
    #[test]
    fn test_block_store() {
        let mut block_store = BlockStore::allocate(1_000);
        // Every list starts with the value 0.
        let list_2 = block_store.new_list(0);
        let list_3 = block_store.new_list(0);
        let list_4 = block_store.new_list(0);
        let list_5 = block_store.new_list(0);
        for i in 1..2_000 {
            block_store.push(list_2, i * 2);
            block_store.push(list_3, i * 3);
        }
        for i in 1..10 {
            block_store.push(list_4, i * 4);
            block_store.push(list_5, i * 5);
        }

        let mut list2_iter = block_store.iter_list(list_2);
        let mut list3_iter = block_store.iter_list(list_3);
        let mut list4_iter = block_store.iter_list(list_4);
        let mut list5_iter = block_store.iter_list(list_5);
        for i in 0..2_000 {
            assert_eq!(list2_iter.next().unwrap(), i * 2);
            assert_eq!(list3_iter.next().unwrap(), i * 3);
        }
        assert!(list2_iter.next().is_none());
        assert!(list3_iter.next().is_none());
        for i in 0..10 {
            assert_eq!(list4_iter.next().unwrap(), i * 4);
            assert_eq!(list5_iter.next().unwrap(), i * 5);
        }
        assert!(list4_iter.next().is_none());
        assert!(list5_iter.next().is_none());
    }
}
|
||||
@@ -1,7 +1,7 @@
|
||||
mod postings;
|
||||
mod recorder;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
mod postings_writer;
|
||||
mod term_info;
|
||||
mod chained_postings;
|
||||
mod vec_postings;
|
||||
@@ -12,13 +12,14 @@ mod freq_handler;
|
||||
mod docset;
|
||||
mod scored_docset;
|
||||
mod segment_postings_option;
|
||||
mod block_appender;
|
||||
|
||||
pub use self::docset::{SkipResult, DocSet};
|
||||
pub use self::offset_postings::OffsetPostings;
|
||||
pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
pub use self::serializer::PostingsSerializer;
|
||||
pub use self::writer::PostingsWriter;
|
||||
pub use self::writer::SpecializedPostingsWriter;
|
||||
pub use self::postings_writer::PostingsWriter;
|
||||
pub use self::postings_writer::SpecializedPostingsWriter;
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
pub use self::vec_postings::VecPostings;
|
||||
|
||||
@@ -4,17 +4,18 @@ use schema::Term;
|
||||
use postings::PostingsSerializer;
|
||||
use std::io;
|
||||
use postings::Recorder;
|
||||
use postings::block_appender::BlockAppender;
|
||||
|
||||
|
||||
struct TermPostingsWriter<Rec: Recorder + 'static> {
|
||||
doc_ids: Vec<DocId>,
|
||||
doc_ids: BlockAppender,
|
||||
recorder: Rec,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> TermPostingsWriter<Rec> {
|
||||
pub fn new() -> TermPostingsWriter<Rec> {
|
||||
TermPostingsWriter {
|
||||
doc_ids: Vec::new(),
|
||||
doc_ids: BlockAppender::new(),
|
||||
recorder: Recorder::new(),
|
||||
}
|
||||
}
|
||||
@@ -29,7 +30,7 @@ impl<Rec: Recorder + 'static> TermPostingsWriter<Rec> {
|
||||
|
||||
pub fn suscribe(&mut self, doc: DocId, pos: u32) {
|
||||
match self.doc_ids.last() {
|
||||
Some(&last_doc) => {
|
||||
Some(last_doc) => {
|
||||
if last_doc != doc {
|
||||
self.close_doc();
|
||||
self.doc_ids.push(doc);
|
||||
@@ -12,7 +12,7 @@ use itertools::Itertools;
|
||||
|
||||
/// Documents are really just a list of `(field, value)` pairs.
|
||||
/// In this list, one field may appear more than once.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, RustcEncodable, RustcDecodable)]
|
||||
pub struct Document {
|
||||
field_values: Vec<FieldValue>,
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::io::Write;
|
||||
use std::io::Read;
|
||||
use common::BinarySerializable;
|
||||
|
||||
#[derive(Copy,Clone,Debug,PartialEq,PartialOrd,Eq,Ord,Hash)]
|
||||
#[derive(Copy,Clone,Debug,PartialEq,PartialOrd,Eq,Ord,Hash, RustcEncodable, RustcDecodable)]
|
||||
pub struct Field(pub u8);
|
||||
|
||||
impl BinarySerializable for Field {
|
||||
|
||||
@@ -6,7 +6,7 @@ use schema::Field;
|
||||
use schema::Value;
|
||||
|
||||
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd)]
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, RustcEncodable, RustcDecodable)]
|
||||
pub struct FieldValue {
|
||||
pub field: Field,
|
||||
pub value: Value,
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, RustcEncodable, RustcDecodable)]
|
||||
pub enum Value {
|
||||
Str(String),
|
||||
U32(u32),
|
||||
|
||||
Reference in New Issue
Block a user