NOBUG Adding documentation

2026-05-20 18:20:41 +00:00 · 2016-09-20 08:58:43 +09:00
parent 17715fe84c
commit e8d5baa44b
4 changed files with 81 additions and 21 deletions
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -2,6 +2,7 @@ use std::iter;
 use std::marker::PhantomData;
 use super::heap::{Heap, BytesRef};

+/// dbj2 hash function
 fn djb2(key: &[u8]) -> u64 {
    let mut state: u64 = 5381; 
    for &b in key {
@@ -19,6 +20,13 @@ impl Default for BytesRef {
    }
 }

+/// `KeyValue` is the item stored in the hash table.
+/// The key is actually a `BytesRef` object stored in an external heap.
+/// The value_addr also points to an address in the heap.
+///
+/// The key and the value are actually stored contiguously.
+/// For this reason, the (start, stop) information is actually redundant
+/// and can be simplified in the future 
 #[derive(Copy, Clone, Default)]
 struct KeyValue {
    key: BytesRef,
@@ -31,6 +39,21 @@ impl KeyValue {
    }
 }

+pub enum Entry {
+    Vacant(usize),
+    Occupied(u32),
+}
+
+
+/// Customized HashMap with string keys
+/// 
+/// This `HashMap` takes String as keys. Keys are
+/// stored in a user defined heap.
+///
+/// The quirky API has the benefit of avoiding
+/// the computation of the hash of the key twice,
+/// or copying the key as long as there is no insert.
+///
 pub struct HashMap<'a, V> where V: From<u32> {
    table: Box<[KeyValue]>,
    heap: &'a Heap,
@@ -39,12 +62,6 @@ pub struct HashMap<'a, V> where V: From<u32> {
    occupied: Vec<usize>,
 }

-pub enum Entry {
-    Vacant(usize),
-    Occupied(u32),
-}
-
-
 impl<'a, V> HashMap<'a, V> where V: From<u32> {

    pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
--- a/src/directory/directory.rs
+++ b/src/directory/directory.rs
@@ -64,7 +64,8 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
    /// 
    /// The file may or may not previously exists.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
-
+        
+    /// Clone the directory and boxes the clone 
    fn box_clone(&self) -> Box<Directory>;
 }

--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -20,6 +20,11 @@ use indexer::segment_serializer::SegmentSerializer;
 use datastruct::stacker::Heap;
 use indexer::index_writer::MARGIN_IN_BYTES;

+/// A SegmentWriter is the object in charge of creating segment index from a
+/// documents.
+///  
+/// They creates the postings list in anonymous memory.
+/// The segment is layed on disk when the segment gets `finalized`.
 pub struct SegmentWriter<'a> {
 	heap: &'a Heap,
    max_doc: DocId,
@@ -29,6 +34,7 @@ pub struct SegmentWriter<'a> {
 	fieldnorms_writer: U32FastFieldsWriter,
 }

+
 fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
 	let u32_fields: Vec<Field> = schema.fields()
 		.iter()
@@ -39,6 +45,7 @@ fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
 	U32FastFieldsWriter::new(u32_fields)
 }

+
 fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
 	match *field_entry.field_type() {
 		FieldType::Str(ref text_options) => {
@@ -61,9 +68,18 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box
 }


-
 impl<'a> SegmentWriter<'a> {
-
+	
+	
+	/// Creates a new `SegmentWriter`
+	///
+	/// The arguments are defined as follows
+	///
+	/// - heap: most of the segment writer data (terms, and postings lists recorders)
+	/// is stored in a user-defined heap object. This makes it possible for the user to define
+	/// the flushing behavior as a buffer limit
+	/// - segment: The segment being written  
+	/// - schema
 	pub fn for_segment(heap: &'a Heap, mut segment: Segment, schema: &Schema) -> Result<SegmentWriter<'a>> {
 		let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
 		let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
@@ -81,13 +97,10 @@ impl<'a> SegmentWriter<'a> {
 		})
 	}
 	
-	// Write on disk all of the stuff that
-	// is still on RAM :
-	// - the dictionary in an fst
-	// - the postings
-	// - the segment info
-	// The segment writer cannot be used after this, which is
-	// enforced by the fact that "self" is moved.
+	/// Lay on disk the current content of the `SegmentWriter`
+	/// 
+	/// Finalize consumes the `SegmentWriter`, so that it cannot 
+	/// be used afterwards.
 	pub fn finalize(mut self,) -> Result<()> {
 		let segment_info = self.segment_info();
 		for per_field_postings_writer in &mut self.per_field_postings_writers {
@@ -101,10 +114,20 @@ impl<'a> SegmentWriter<'a> {
 			  self.heap)
 	}
 	
+	/// Returns true iff the segment writer's buffer has reached capacity.
+	///
+	/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
+	/// The `Segment` is `finalize`d when the buffer gets full.
+	///
+	/// Because, we cannot cut through a document, the margin is there to ensure that we rarely
+	/// exceeds the heap size.  
 	pub fn is_buffer_full(&self,) -> bool {
 		self.heap.num_free_bytes() <= MARGIN_IN_BYTES
 	}
 	
+	/// Indexes a new document
+	///
+	/// As a user, you should rather use `IndexWriter`'s add_document.
    pub fn add_document(&mut self, doc: &Document, schema: &Schema) -> io::Result<()> {
        let doc_id = self.max_doc;
        for (field, field_values) in doc.get_sorted_field_values() {
@@ -141,7 +164,6 @@ impl<'a> SegmentWriter<'a> {
 			}
 		}
 		self.fieldnorms_writer.fill_val_up_to(doc_id);
-		
 		self.fast_field_writers.add_document(doc);
 		let stored_fieldvalues: Vec<&FieldValue> = doc
 			.field_values()
@@ -153,20 +175,39 @@ impl<'a> SegmentWriter<'a> {
        self.max_doc += 1;
 		Ok(())
    }
-
-
-	fn segment_info(&self,) -> SegmentInfo {
+	
+	/// Creates the `SegmentInfo` that will be serialized along
+	/// with the index in JSON format.  
+ 	fn segment_info(&self,) -> SegmentInfo {
 		SegmentInfo {
 			max_doc: self.max_doc
 		}
 	}
-
+	
+	
+	/// Max doc is 
+	/// - the number of documents in the segment assuming there is no deletes
+	/// - the maximum document id (including deleted documents) + 1
+	///
+	/// Currently, **tantivy** does not handle deletes anyway,
+	/// so `max_doc == num_docs`  
 	pub fn max_doc(&self,) -> u32 {
 		self.max_doc
 	}
+	
+	/// Number of documents in the index.
+	/// Deleted documents are not counted.
+	///
+	/// Currently, **tantivy** does not handle deletes anyway,
+	/// so `max_doc == num_docs`
+	#[allow(dead_code)]
+	pub fn num_docs(&self,) -> u32 {
+		self.max_doc
+	}

 }

+// This method is used as a trick to workaround the borrow checker
 fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
 		 fast_field_writers: &U32FastFieldsWriter,
 		 fieldnorms_writer: &U32FastFieldsWriter,
--- a/src/query/similarity.rs
+++ b/src/query/similarity.rs
@@ -2,6 +2,7 @@ use Score;
 use query::Explanation;
 use query::MultiTermAccumulator;

+/// Similarity based scoring.
 pub trait Similarity: MultiTermAccumulator {
    fn score(&self, ) -> Score;
    fn explain(&self, vals: &[(usize, u32, u32)]) -> Explanation;