diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs
index 4ebb53927..bee80a96e 100644
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -2,6 +2,7 @@
 use std::iter;
 use std::marker::PhantomData;
 use super::heap::{Heap, BytesRef};
+/// djb2 hash function
 fn djb2(key: &[u8]) -> u64 {
     let mut state: u64 = 5381;
     for &b in key {
@@ -19,6 +20,13 @@ impl Default for BytesRef {
     }
 }
 
+/// `KeyValue` is the item stored in the hash table.
+/// The key is actually a `BytesRef` object stored in an external heap.
+/// The `value_addr` also points to an address in the heap.
+///
+/// The key and the value are actually stored contiguously.
+/// For this reason, the (start, stop) information is actually redundant
+/// and can be simplified in the future.
 #[derive(Copy, Clone, Default)]
 struct KeyValue {
     key: BytesRef,
@@ -31,6 +39,21 @@ impl KeyValue {
     }
 }
 
+pub enum Entry {
+    Vacant(usize),
+    Occupied(u32),
+}
+
+
+/// Customized `HashMap` with string keys
+///
+/// This `HashMap` takes strings as keys. Keys are
+/// stored in a user-defined heap.
+///
+/// The quirky API has the benefit of avoiding
+/// the computation of the hash of the key twice,
+/// and of copying the key, as long as there is no insert.
+///
 pub struct HashMap<'a, V> where V: From<u32> {
     table: Box<[KeyValue]>,
     heap: &'a Heap,
@@ -39,12 +62,6 @@ pub struct HashMap<'a, V> where V: From<u32> {
     occupied: Vec<usize>,
 }
 
-pub enum Entry {
-    Vacant(usize),
-    Occupied(u32),
-}
-
-
 impl<'a, V> HashMap<'a, V> where V: From<u32> {
 
     pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
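The doc comment above explains the point of the `Entry` API: the key is hashed once, and the resulting bucket is reused whether the slot turns out to be `Vacant` or `Occupied`, so an insert needs neither a second hash computation nor an early key copy. Below is a minimal, runnable sketch of that first step; the diff truncates `djb2`'s loop body, so the textbook `state * 33 + byte` update is an assumption here:

```rust
/// Textbook djb2 recurrence (hash = hash * 33 + byte); the diff cuts off the
/// real loop body, so this exact update is assumed, not taken from tantivy.
fn djb2(key: &[u8]) -> u64 {
    let mut state: u64 = 5381;
    for &b in key {
        state = state.wrapping_mul(33).wrapping_add(u64::from(b));
    }
    state
}

fn main() {
    // With 2^num_bucket_power_of_2 buckets, a mask maps the hash to a bucket.
    // The bucket is computed once; a probing lookup can then report either
    // `Entry::Vacant(bucket)` or `Entry::Occupied(value_addr)`, and an insert
    // can reuse the bucket without rehashing the key.
    let num_buckets: usize = 1 << 10;
    let bucket = (djb2(b"tantivy") as usize) & (num_buckets - 1);
    println!("bucket = {}", bucket);
}
```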
diff --git a/src/directory/directory.rs b/src/directory/directory.rs
index 4fed52a04..08f602251 100644
--- a/src/directory/directory.rs
+++ b/src/directory/directory.rs
@@ -64,7 +64,8 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
     ///
     /// The file may or may not previously exists.
     fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
-    
+
+    /// Clones the directory and boxes the clone
     fn box_clone(&self) -> Box<Directory>;
 }
 
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 574f76f31..c90459430 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -20,6 +20,11 @@ use indexer::segment_serializer::SegmentSerializer;
 use datastruct::stacker::Heap;
 use indexer::index_writer::MARGIN_IN_BYTES;
 
+/// A `SegmentWriter` is the object in charge of creating
+/// a segment index from documents.
+///
+/// It creates the postings lists in anonymous memory.
+/// The segment is laid on disk when the segment gets `finalized`.
 pub struct SegmentWriter<'a> {
     heap: &'a Heap,
     max_doc: DocId,
@@ -29,6 +34,7 @@ pub struct SegmentWriter<'a> {
     fieldnorms_writer: U32FastFieldsWriter,
 }
 
+
 fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
     let u32_fields: Vec<Field> = schema.fields()
         .iter()
@@ -39,6 +45,7 @@ fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
     U32FastFieldsWriter::new(u32_fields)
 }
 
+
 fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
     match *field_entry.field_type() {
         FieldType::Str(ref text_options) => {
@@ -61,9 +68,18 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
     }
 }
 
-
 impl<'a> SegmentWriter<'a> {
-    
+
+
+    /// Creates a new `SegmentWriter`
+    ///
+    /// The arguments are defined as follows
+    ///
+    /// - heap: most of the segment writer data (terms and postings list recorders)
+    ///   is stored in a user-defined heap object. This makes it possible for the user
+    ///   to define the flushing behavior as a buffer limit
+    /// - segment: the segment being written
+    /// - schema: the schema of the segment
     pub fn for_segment(heap: &'a Heap, mut segment: Segment, schema: &Schema) -> Result<SegmentWriter<'a>> {
         let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
         let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
@@ -81,13 +97,10 @@ impl<'a> SegmentWriter<'a> {
         })
     }
 
-    // Write on disk all of the stuff that
-    // is still on RAM :
-    // - the dictionary in an fst
-    // - the postings
-    // - the segment info
-    // The segment writer cannot be used after this, which is
-    // enforced by the fact that "self" is moved.
+    /// Lays on disk the current content of the `SegmentWriter`.
+    ///
+    /// Finalize consumes the `SegmentWriter`, so that it cannot
+    /// be used afterwards.
     pub fn finalize(mut self,) -> Result<()> {
         let segment_info = self.segment_info();
         for per_field_postings_writer in &mut self.per_field_postings_writers {
@@ -101,10 +114,20 @@ impl<'a> SegmentWriter<'a> {
             self.heap)
     }
 
+    /// Returns true iff the segment writer's buffer has reached capacity.
+    ///
+    /// The limit is defined as `the user-defined heap size - an arbitrary margin of 10MB`.
+    /// The `Segment` is `finalize`d when the buffer gets full.
+    ///
+    /// Because we cannot cut through a document, the margin is there
+    /// to ensure that we rarely exceed the heap size.
     pub fn is_buffer_full(&self,) -> bool {
         self.heap.num_free_bytes() <= MARGIN_IN_BYTES
     }
 
+    /// Indexes a new document.
+    ///
+    /// As a user, you should rather use `IndexWriter`'s `add_document`.
     pub fn add_document(&mut self, doc: &Document, schema: &Schema) -> io::Result<()> {
         let doc_id = self.max_doc;
         for (field, field_values) in doc.get_sorted_field_values() {
@@ -141,7 +164,6 @@ impl<'a> SegmentWriter<'a> {
             }
         }
         self.fieldnorms_writer.fill_val_up_to(doc_id);
-
         self.fast_field_writers.add_document(doc);
         let stored_fieldvalues: Vec<&FieldValue> = doc
             .field_values()
@@ -153,20 +175,39 @@ impl<'a> SegmentWriter<'a> {
         self.max_doc += 1;
         Ok(())
     }
-    
-    
-    fn segment_info(&self,) -> SegmentInfo {
+
+    /// Creates the `SegmentInfo` that will be serialized along
+    /// with the index in JSON format.
+    fn segment_info(&self,) -> SegmentInfo {
         SegmentInfo {
             max_doc: self.max_doc
         }
     }
-    
+
+
+    /// Max doc is
+    /// - the number of documents in the segment, assuming there are no deletes
+    /// - the maximum document id (including deleted documents) + 1
+    ///
+    /// Currently, **tantivy** does not handle deletes anyway,
+    /// so `max_doc == num_docs`
     pub fn max_doc(&self,) -> u32 {
         self.max_doc
     }
+
+    /// Number of documents in the index.
+    /// Deleted documents are not counted.
+    ///
+    /// Currently, **tantivy** does not handle deletes anyway,
+    /// so `max_doc == num_docs`
+    #[allow(dead_code)]
+    pub fn num_docs(&self,) -> u32 {
+        self.max_doc
+    }
 }
 
+// This function is used as a trick to work around the borrow checker.
 fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
              fast_field_writers: &U32FastFieldsWriter,
              fieldnorms_writer: &U32FastFieldsWriter,
diff --git a/src/query/similarity.rs b/src/query/similarity.rs
index f3fbb4409..0c3219b2f 100644
--- a/src/query/similarity.rs
+++ b/src/query/similarity.rs
@@ -2,6 +2,7 @@
 use Score;
 use query::Explanation;
 use query::MultiTermAccumulator;
 
+/// Similarity-based scoring.
 pub trait Similarity: MultiTermAccumulator {
     fn score(&self, ) -> Score;
     fn explain(&self, vals: &[(usize, u32, u32)]) -> Explanation;
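Taken together, the new doc comments in `segment_writer.rs` describe a simple lifecycle: build the writer on a user-defined heap, feed it documents until the buffer fills, then `finalize`. The sketch below illustrates that flow; `Heap::with_capacity`, `Index::new_segment` and `Index::schema` are assumed helpers (not part of this diff), and it assumes tantivy's `Result` converts from `io::Result` inside `try!`:

```rust
// Hedged sketch of the SegmentWriter lifecycle; only `for_segment`,
// `add_document`, `is_buffer_full` and `finalize` appear in this diff.
fn write_segment(index: &Index, docs: &[Document]) -> Result<()> {
    let schema = index.schema();
    // User-defined heap: it must be comfortably larger than the 10MB
    // MARGIN_IN_BYTES for is_buffer_full() to leave useful headroom.
    let heap = Heap::with_capacity(100_000_000);
    let mut writer = try!(SegmentWriter::for_segment(&heap, index.new_segment(), &schema));
    for doc in docs {
        try!(writer.add_document(doc, &schema));
        // Stop before the heap overflows; since we cannot cut through a
        // document, the margin absorbs the documents already in flight.
        // A real indexer would open a new segment for the remaining docs.
        if writer.is_buffer_full() {
            break;
        }
    }
    // finalize() consumes the writer and lays the segment on disk.
    writer.finalize()
}
```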