issue/174 Added doc, and made field private

This commit is contained in:
Paul Masurel
2017-05-25 14:08:36 +09:00
parent e0fce4782a
commit 87152daef3
4 changed files with 86 additions and 12 deletions

View File

@@ -31,6 +31,11 @@ pub trait HasLen {
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
@@ -43,13 +48,15 @@ const HIGHEST_BIT: u64 = 1 << 63;
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline(always)]
pub fn i64_to_u64(val: i64) -> u64 {
    // Reinterpret the two's-complement bits as unsigned, then flip the
    // highest bit.
    let raw_bits = val as u64;
    raw_bits ^ HIGHEST_BIT
}
/// Reverse the mapping given by
/// `i64_to_u64`.
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64

View File

@@ -1,9 +1,43 @@
/*!
Tantivy's store is a compressed, row-oriented storage.
A field needs to be marked as stored in the schema in
order to be handled in the `Store`.
Internally, documents (or rather their stored fields) are serialized to a buffer.
When the buffer exceeds 16K, the buffer is compressed using `LZ4`
and the resulting block is written to disk.
One can then request a specific `DocId`.
A skip list helps navigate to the right block, which is then
decompressed entirely to return the document within it.
If the last document requested was in the same block,
the reader is smart enough to avoid decompressing
the block a second time, but there is no real
*uncompressed block* cache.
A typical use case for the store is, once
the search result page has been computed, returning
the actual content of the 10 best documents.
# Usage
Most users should not access the `StoreReader` directly
and should rely on either
- at the segment level, the [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
- at the index level, the [`Searcher`'s `doc` method](../struct.Searcher.html#method.doc)
!*/
mod reader;
mod writer;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;
#[cfg(test)]
mod tests {

View File

@@ -11,16 +11,20 @@ use std::io::{self, Read};
use datastruct::SkipList;
use lz4;
/// Reads documents off tantivy's [`Store`](./index.html)
#[derive(Clone)]
pub struct StoreReader {
pub data: ReadOnlySource,
pub offset_index_source: ReadOnlySource,
data: ReadOnlySource,
offset_index_source: ReadOnlySource,
current_block_offset: RefCell<usize>,
current_block: RefCell<Vec<u8>>,
pub max_doc: DocId,
max_doc: DocId,
}
impl StoreReader {
/// Opens a store reader
pub fn from_source(data: ReadOnlySource) -> StoreReader {
let (data_source, offset_index_source, max_doc) = split_source(data);
StoreReader {
@@ -55,20 +59,27 @@ impl StoreReader {
Ok(())
}
/// Reads a given document.
///
/// Calling `.get(doc)` is relatively costly as it requires
/// decompressing an LZ4-compressed block.
///
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> Result<Document> {
let (first_doc_id, block_offset) = self.block_offset(doc_id);
try!(self.read_block(block_offset as usize));
self.read_block(block_offset as usize)?;
let current_block_mut = self.current_block.borrow_mut();
let mut cursor = &current_block_mut[..];
for _ in first_doc_id..doc_id {
let block_length = try!(u32::deserialize(&mut cursor));
cursor = &cursor[block_length as usize..];
}
try!(u32::deserialize(&mut cursor));
u32::deserialize(&mut cursor)?;
let num_fields = u32::deserialize(&mut cursor)?;
let mut field_values = Vec::new();
let num_fields = try!(u32::deserialize(&mut cursor));
for _ in 0..num_fields {
let field_value = try!(FieldValue::deserialize(&mut cursor));
let field_value = FieldValue::deserialize(&mut cursor)?;
field_values.push(field_value);
}
Ok(Document::from(field_values))
@@ -83,7 +94,5 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
let offset = offset as usize;
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
let res = (data.slice(0, offset), data.slice(offset, footer_offset), max_doc);
drop(data);
res
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
}

View File

@@ -8,6 +8,15 @@ use datastruct::SkipListBuilder;
const BLOCK_SIZE: usize = 16_384;
/// Write tantivy's [`Store`](./index.html)
///
/// Contrary to the other components of `tantivy`,
/// the store is written to disk as documents are being added,
/// as opposed to when the segment is getting finalized.
///
/// The skip list index, on the other hand, is built in memory.
///
pub struct StoreWriter {
doc: DocId,
written: u64,
@@ -19,6 +28,11 @@ pub struct StoreWriter {
impl StoreWriter {
/// Create a store writer.
///
/// The store writer will write blocks to disk as
/// documents are added.
pub fn new(writer: WritePtr) -> StoreWriter {
StoreWriter {
doc: 0,
@@ -30,6 +44,11 @@ impl StoreWriter {
}
}
/// Store a new document.
///
/// The document id is implicitly the number of times
/// this method has been called.
///
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
self.intermediary_buffer.clear();
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
@@ -62,6 +81,11 @@ impl StoreWriter {
Ok(())
}
/// Finalizes the store writer.
///
/// Compresses the last unfinished block if any,
/// and serializes the skip list index to disk.
pub fn close(mut self) -> io::Result<()> {
if !self.current_block.is_empty() {
try!(self.write_and_compress_block());