diff --git a/src/common/mod.rs b/src/common/mod.rs index eef6b283d..e8e9facdd 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -31,6 +31,11 @@ pub trait HasLen { const HIGHEST_BIT: u64 = 1 << 63; +/// Maps an `i64` to `u64` +/// +/// For simplicity, tantivy internally handles `i64` as `u64`. +/// The mapping is defined by this function. +/// /// Maps `i64` to `u64` so that /// `-2^63 .. 2^63-1` is mapped /// to @@ -43,13 +48,15 @@ const HIGHEST_BIT: u64 = 1 << 63; /// Imagine a list of `i64` ranging from -10 to 10. /// When casting negative values, the negative values are projected /// to values over 2^63, and all values end up requiring 64 bits. +/// +/// # See also +/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html). #[inline(always)] pub fn i64_to_u64(val: i64) -> u64 { (val as u64) ^ HIGHEST_BIT } -/// Reverse the mapping given by -/// `i64_to_u64`. +/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html). #[inline(always)] pub fn u64_to_i64(val: u64) -> i64 { (val ^ HIGHEST_BIT) as i64 diff --git a/src/store/mod.rs b/src/store/mod.rs index 8e2431d03..ba2fbc529 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -1,9 +1,43 @@ +/*! +Tantivy's store is a compressed, row-oriented storage. + +A field needs to be marked as stored in the schema in +order to be handled in the `Store`. + +Internally, documents (or rather their stored fields) are serialized to a buffer. +When the buffer exceeds 16K, the buffer is compressed using `LZ4` +and the resulting block is written to disk. + +One can then request a specific `DocId`. +A skip list helps navigate to the right block, +decompresses it entirely and returns the document within it. + +If the last document requested was in the same block, +the reader is smart enough to avoid decompressing +the block a second time, but there is no real +*uncompressed block* cache. 
+ +A typical use case for the store is, once +the search result page has been computed, returning +the actual content of the 10 best documents. + +# Usage + +Most users should not access the `StoreReader` directly +and should rely on either + +- at the segment level, the [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc) +- at the index level, the [`Searcher`'s `doc` method](../struct.Searcher.html#method.doc) + +!*/ + mod reader; mod writer; pub use self::reader::StoreReader; pub use self::writer::StoreWriter; + #[cfg(test)] mod tests { diff --git a/src/store/reader.rs b/src/store/reader.rs index 329ce5ae7..9640a54a7 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -11,16 +11,20 @@ use std::io::{self, Read}; use datastruct::SkipList; use lz4; + +/// Reads documents from tantivy's [`Store`](./index.html) #[derive(Clone)] pub struct StoreReader { - pub data: ReadOnlySource, - pub offset_index_source: ReadOnlySource, + data: ReadOnlySource, + offset_index_source: ReadOnlySource, current_block_offset: RefCell, current_block: RefCell>, - pub max_doc: DocId, + max_doc: DocId, } impl StoreReader { + + /// Opens a store reader pub fn from_source(data: ReadOnlySource) -> StoreReader { let (data_source, offset_index_source, max_doc) = split_source(data); StoreReader { @@ -55,20 +59,27 @@ impl StoreReader { Ok(()) } + /// Reads a given document. + /// + /// Calling `.get(doc)` is relatively costly as it requires + /// decompressing an LZ4-compressed block. + /// + /// It should not be called to score documents + /// for instance. 
pub fn get(&self, doc_id: DocId) -> Result { let (first_doc_id, block_offset) = self.block_offset(doc_id); - try!(self.read_block(block_offset as usize)); + self.read_block(block_offset as usize)?; let current_block_mut = self.current_block.borrow_mut(); let mut cursor = &current_block_mut[..]; for _ in first_doc_id..doc_id { let block_length = try!(u32::deserialize(&mut cursor)); cursor = &cursor[block_length as usize..]; } - try!(u32::deserialize(&mut cursor)); + u32::deserialize(&mut cursor)?; + let num_fields = u32::deserialize(&mut cursor)?; let mut field_values = Vec::new(); - let num_fields = try!(u32::deserialize(&mut cursor)); for _ in 0..num_fields { - let field_value = try!(FieldValue::deserialize(&mut cursor)); + let field_value = FieldValue::deserialize(&mut cursor)?; field_values.push(field_value); } Ok(Document::from(field_values)) @@ -83,7 +94,5 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) let offset = u64::deserialize(&mut serialized_offset_buf).unwrap(); let offset = offset as usize; let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap(); - let res = (data.slice(0, offset), data.slice(offset, footer_offset), max_doc); - drop(data); - res + (data.slice(0, offset), data.slice(offset, footer_offset), max_doc) } diff --git a/src/store/writer.rs b/src/store/writer.rs index 1221a46ec..5d2959aa5 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -8,6 +8,15 @@ use datastruct::SkipListBuilder; const BLOCK_SIZE: usize = 16_384; + +/// Writes tantivy's [`Store`](./index.html) +/// +/// Contrary to the other components of `tantivy`, +/// the store is written to disk as documents are being added, +/// as opposed to when the segment is getting finalized. +/// +/// The skip list index on the other hand, is built in memory. +/// pub struct StoreWriter { doc: DocId, written: u64, @@ -19,6 +28,11 @@ pub struct StoreWriter { impl StoreWriter { + + /// Create a store writer. 
+ /// + /// The store writer will write blocks to disk as + /// documents are added. pub fn new(writer: WritePtr) -> StoreWriter { StoreWriter { doc: 0, @@ -30,6 +44,11 @@ impl StoreWriter { } } + /// Store a new document. + /// + /// The document id is implicitly the number of times + /// this method has been called. + /// pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> { self.intermediary_buffer.clear(); try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); @@ -62,6 +81,11 @@ impl StoreWriter { Ok(()) } + + /// Finalizes the store writer. + /// + /// Compresses the last unfinished block if any, + /// and serializes the skip list index to disk. pub fn close(mut self) -> io::Result<()> { if !self.current_block.is_empty() { try!(self.write_and_compress_block());