issue/174 Added doc, and made field private

2026-05-27 13:40:49 +00:00 · 2017-05-25 14:08:36 +09:00
parent e0fce4782a
commit 87152daef3
4 changed files with 86 additions and 12 deletions
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -31,6 +31,11 @@ pub trait HasLen {
 const HIGHEST_BIT: u64 = 1 << 63;


+/// Maps an `i64` to `u64`.
+///
+/// For simplicity, tantivy internally handles `i64` as `u64`.
+/// The mapping is defined by this function.
+///
 /// Maps `i64` to `u64` so that
 /// `-2^63 .. 2^63-1` is mapped
 ///     to
@@ -43,13 +48,15 @@ const HIGHEST_BIT: u64 = 1 << 63;
 /// Imagine a list of `i64` ranging from -10 to 10.
 /// When casting negative values, the negative values are projected
 /// to values over 2^63, and all values end up requiring 64 bits.
+///
+/// # See also
+/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
 #[inline(always)]
 pub fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
 }

-/// Reverse the mapping given by
-/// `i64_to_u64`.
+/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
 #[inline(always)]
 pub fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -1,9 +1,43 @@
+/*!
+Tantivy's store is a compressed, row-oriented storage.
+
+A field needs to be marked as stored in the schema in 
+order to be handled in the `Store`.
+
+Internally, documents (or rather their stored fields) are serialized to a buffer.
+When the buffer exceeds 16K, the buffer is compressed using `LZ4`
+and the resulting block is written to disk.
+
+One can then request a specific `DocId`.
+A skip list helps navigate to the right block, which is then
+decompressed entirely in order to return the document within it.
+
+If the last document requested was in the same block,
+the reader is smart enough to avoid decompressing 
+the block a second time, but there is no real
+*uncompressed block* cache.
+
+A typical use case for the store is, once
+the search result page has been computed, returning
+the actual content of the 10 best documents.
+
+# Usage
+
+Most users should not access the `StoreReader` directly
+and should rely on either
+
+- at the segment level, the [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
+- at the index level,  the [`Searcher`'s `doc` method](../struct.Searcher.html#method.doc)
+
+!*/
+
 mod reader;
 mod writer;
 pub use self::reader::StoreReader;
 pub use self::writer::StoreWriter;


+
 #[cfg(test)]
 mod tests {

--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -11,16 +11,20 @@ use std::io::{self, Read};
 use datastruct::SkipList;
 use lz4;

+
+/// Reads documents off tantivy's [`Store`](./index.html)
 #[derive(Clone)]
 pub struct StoreReader {
-    pub data: ReadOnlySource,
-    pub offset_index_source: ReadOnlySource,
+    data: ReadOnlySource,
+    offset_index_source: ReadOnlySource,
    current_block_offset: RefCell<usize>,
    current_block: RefCell<Vec<u8>>,
-    pub max_doc: DocId,
+    max_doc: DocId,
 }

 impl StoreReader {
+
+    /// Opens a store reader
    pub fn from_source(data: ReadOnlySource) -> StoreReader {
        let (data_source, offset_index_source, max_doc) = split_source(data);
        StoreReader {
@@ -55,20 +59,27 @@ impl StoreReader {
        Ok(())
    }

+    /// Reads a given document.
+    ///
+    /// Calling `.get(doc)` is relatively costly as it requires
+    /// decompressing an LZ4-compressed block.
+    ///
+    /// It should not be called to score documents
+    /// for instance.
    pub fn get(&self, doc_id: DocId) -> Result<Document> {
        let (first_doc_id, block_offset) = self.block_offset(doc_id);
-        try!(self.read_block(block_offset as usize));
+        self.read_block(block_offset as usize)?;
        let current_block_mut = self.current_block.borrow_mut();
        let mut cursor = &current_block_mut[..];
        for _ in first_doc_id..doc_id {
            let block_length = try!(u32::deserialize(&mut cursor));
            cursor = &cursor[block_length as usize..];
        }
-        try!(u32::deserialize(&mut cursor));
+        u32::deserialize(&mut cursor)?;
+        let num_fields = u32::deserialize(&mut cursor)?;
        let mut field_values = Vec::new();
-        let num_fields = try!(u32::deserialize(&mut cursor));
        for _ in 0..num_fields {
-            let field_value = try!(FieldValue::deserialize(&mut cursor));
+            let field_value = FieldValue::deserialize(&mut cursor)?;
            field_values.push(field_value);
        }
        Ok(Document::from(field_values))
@@ -83,7 +94,5 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
    let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
    let offset = offset as usize;
    let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
-    let res = (data.slice(0, offset), data.slice(offset, footer_offset), max_doc);
-    drop(data);
-    res
+    (data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
 }
--- a/src/store/writer.rs
+++ b/src/store/writer.rs
@@ -8,6 +8,15 @@ use datastruct::SkipListBuilder;

 const BLOCK_SIZE: usize = 16_384;

+
+/// Write tantivy's [`Store`](./index.html)
+///
+/// Contrary to the other components of `tantivy`,
+/// the store is written to disk as documents are being added,
+/// as opposed to when the segment is getting finalized.
+///
+/// The skip list index, on the other hand, is built in memory.
+///
 pub struct StoreWriter {
    doc: DocId,
    written: u64,
@@ -19,6 +28,11 @@ pub struct StoreWriter {


 impl StoreWriter {
+
+    /// Create a store writer.
+    ///
+    /// The store writer will write blocks to disk as
+    /// documents are added.
    pub fn new(writer: WritePtr) -> StoreWriter {
        StoreWriter {
            doc: 0,
@@ -30,6 +44,11 @@ impl StoreWriter {
        }
    }

+    /// Store a new document.
+    /// 
+    /// The document id is implicitly the number of times
+    /// this method has been called.
+    ///
    pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
        self.intermediary_buffer.clear();
        try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
@@ -62,6 +81,11 @@ impl StoreWriter {
        Ok(())
    }

+
+    /// Finalizes the store writer.
+    ///
+    /// Compresses the last unfinished block, if any,
+    /// and serializes the skip list index to disk.
    pub fn close(mut self) -> io::Result<()> {
        if !self.current_block.is_empty() {
            try!(self.write_and_compress_block());