diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 456db4858..f765a3956 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -4,6 +4,30 @@ use common::serialize::BinarySerializable; use std::mem; +/// Computes the number of bits that will be used for bitpacking. +/// +/// In general the target is the minimum number of bits +/// required to express the amplitude given as argument. +/// +/// e.g. If the amplitude is 10, we can store all ints on simply 4 bits. +/// +/// The logic is slightly more convoluted here as, for optimization +/// reasons, we want to ensure that a value spans at most +/// 8 aligned bytes. +/// +/// Spanning 9 bytes is possible, for instance, if we do +/// bitpacking with an amplitude of 63 bits. +/// In this case, the second int will start on bit +/// 63 (which belongs to byte 7) and end at byte 15; +/// Hence 9 bytes (from byte 7 to byte 15 included). +/// +/// To avoid this, we force the number of bits to 64 bits +/// when the result is greater than `64-8 = 56 bits`. +/// +/// Note that this only affects rare use cases spanning +/// a very large range of values. Even in this case, it results +/// in an extra cost of at most 12% compared to the optimal +/// number of bits. pub fn compute_num_bits(amplitude: u64) -> u8 { let amplitude = (64u32 - amplitude.leading_zeros()) as u8; if amplitude <= 64 - 8 { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 846136179..a8d956576 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1,14 +1,25 @@ -/// Fast field module -/// -/// Fast fields are the equivalent of `DocValues` in `Lucene`. -/// Fast fields are stored in column-oriented fashion and allow fast -/// random access given a `DocId`. -/// -/// Their performance is comparable to that of an array lookup. -/// They are useful when a field is required for all or most of -/// the `DocSet` : for instance for scoring, grouping, filtering, or facetting. 
-/// -/// Currently only u64 fastfield are supported. +//! # Fast fields +//! +//! Fast fields are the equivalent of `DocValues` in `Lucene`. +//! Fast fields are the non-compressed, column-oriented storage +//! of `tantivy`. +//! +//! They are designed for fast random access to some document +//! fields given a document id. +//! +//! Fast fields are useful when a field is required for all or most of +//! the `DocSet`: for instance for scoring, grouping, filtering, or faceting. +//! +//! +//! Fields have to be declared as `FAST` in the schema. +//! Currently only 64-bit integers (signed or unsigned) are +//! supported. +//! +//! They are stored in a bitpacked fashion so that their +//! memory usage grows with the number of bits needed to express +//! the amplitude of the values stored. +//! +//! Read access performance is comparable to that of an array lookup. mod reader; mod writer; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index f90edfeea..23229184c 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -14,7 +14,10 @@ use common::bitpacker::BitUnpacker; use schema::FieldType; use common; - +/// Trait for accessing a fastfield. +/// +/// Depending on the field type, a different +/// fast field is required. pub trait FastFieldReader: Sized { /// Type of the value stored in the fastfield. @@ -35,6 +38,7 @@ pub trait FastFieldReader: Sized { fn is_enabled(field_type: &FieldType) -> bool; } +/// FastFieldReader for unsigned 64-bit integers. pub struct U64FastFieldReader { _data: ReadOnlySource, bit_unpacker: BitUnpacker, @@ -131,8 +135,7 @@ impl From> for U64FastFieldReader { } } - - +/// FastFieldReader for signed 64-bit integers. pub struct I64FastFieldReader { underlying: U64FastFieldReader, } @@ -192,7 +195,11 @@ impl FastFieldReader for I64FastFieldReader { - +/// The `FastFieldsReader` is the data structure containing +/// all of the fast fields' data. 
+/// +/// It contains a mapping that associates these fields to +/// the proper slice in the fastfield reader file. pub struct FastFieldsReader { source: ReadOnlySource, field_offsets: HashMap, @@ -200,6 +207,11 @@ pub struct FastFieldsReader { impl FastFieldsReader { + /// Opens the `FastFieldsReader` file. + /// + /// When opening the fast field reader, the + /// list of field offsets is read (as a footer of the + /// data file). pub fn open(source: ReadOnlySource) -> io::Result { let header_offset; let field_offsets: Vec<(Field, u32)>; @@ -207,11 +219,11 @@ impl FastFieldsReader { let buffer = source.as_slice(); { let mut cursor = buffer; - header_offset = try!(u32::deserialize(&mut cursor)); + header_offset = u32::deserialize(&mut cursor)?; } { let mut cursor = &buffer[header_offset as usize..]; - field_offsets = try!(Vec::deserialize(&mut cursor)); + field_offsets = Vec::deserialize(&mut cursor)?; } } let mut end_offsets: Vec = field_offsets diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 02c92df26..396f03606 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -6,6 +6,7 @@ use DocId; use common; use schema::FieldType; +/// The `FastFieldsWriter` groups together all of the fast field writers. pub struct FastFieldsWriter { field_writers: Vec, } diff --git a/src/lib.rs b/src/lib.rs index fbf60ecf2..c11397465 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -92,7 +92,6 @@ pub type Result = std::result::Result; mod core; mod compression; - mod store; mod indexer; mod common; @@ -112,7 +111,7 @@ pub mod collector; pub mod postings; /// Schema pub mod schema; -// FastField module + pub mod fastfield; diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 417598951..de477975c 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -31,6 +31,7 @@ pub enum FieldType { impl FieldType { + /// Returns true iff the field is indexed. pub fn is_indexed(&self) -> bool { match self { &FieldType::Str(ref text_options) => {