From 1a08ca4f95b690c6d4ae3f737dd8ab41380dbb82 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 22 Sep 2016 15:45:37 +0900 Subject: [PATCH] Doc --- src/common/mod.rs | 5 +++++ src/common/timer.rs | 10 +++++++++- src/core/index.rs | 5 ++++- src/core/segment_reader.rs | 16 ++++++++++++---- src/directory/error.rs | 1 - src/directory/mod.rs | 2 ++ src/error.rs | 9 ++++++--- src/fastfield/mod.rs | 12 ++++++++++++ src/indexer/index_writer.rs | 10 ++++++++-- src/indexer/segment_serializer.rs | 21 +++++++++++++++------ src/lib.rs | 19 ++++++++++++++----- src/postings/mod.rs | 6 ++++++ src/query/mod.rs | 5 +++++ 13 files changed, 98 insertions(+), 23 deletions(-) diff --git a/src/common/mod.rs b/src/common/mod.rs index d3beadb48..e4322d27e 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -14,8 +14,13 @@ pub fn make_io_err(msg: String) -> io::Error { io::Error::new(io::ErrorKind::Other, msg) } + +/// Has length trait pub trait HasLen { + /// Return length fn len(&self,) -> usize; + + /// Returns true iff empty. fn is_empty(&self,) -> bool { self.len() == 0 } diff --git a/src/common/timer.rs b/src/common/timer.rs index ae1d3959e..3f3950422 100644 --- a/src/common/timer.rs +++ b/src/common/timer.rs @@ -8,6 +8,10 @@ pub struct OpenTimer<'a> { } impl<'a> OpenTimer<'a> { + /// Starts timing a new named subtask + /// + /// The timer is stopped automatically + /// when the `OpenTimer` is dropped. 
pub fn open(&mut self, name: &'static str) -> OpenTimer { OpenTimer { name: name, @@ -28,6 +32,7 @@ impl<'a> Drop for OpenTimer<'a> { } } +/// Timing recording #[derive(Debug, RustcEncodable)] pub struct Timing { name: &'static str, @@ -35,17 +40,20 @@ pub struct Timing { depth: u32, } +/// Timer tree #[derive(Debug, RustcEncodable)] pub struct TimerTree { timings: Vec, } impl TimerTree { - + + /// Returns the total time elapsed in microseconds pub fn total_time(&self,) -> i64 { self.timings.last().unwrap().duration } + /// Open a new named subtask pub fn open(&mut self, name: &'static str) -> OpenTimer { OpenTimer { name: name, diff --git a/src/core/index.rs b/src/core/index.rs index d1f34c9d8..ac2b287cf 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -150,7 +150,10 @@ impl Index { pub fn writer(&self, heap_size_in_bytes: usize) -> Result { self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes) } - + + /// Accessor to the index schema + /// + /// The schema is actually cloned. pub fn schema(&self,) -> Schema { self.schema.clone() } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 99153861f..be1e46ec2 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -65,7 +65,8 @@ impl SegmentReader { pub fn num_docs(&self) -> DocId { self.segment_info.max_doc } - + + /// Accessor to a segment's fast field reader given a field. pub fn get_fast_field_reader(&self, field: Field) -> io::Result { let field_entry = self.schema.get_field_entry(field); match *field_entry.field_type() { @@ -80,11 +81,17 @@ impl SegmentReader { } } + /// Accessor to the segment's `Field norms`'s reader. + /// + /// Field norms are the length (in tokens) of the fields. + /// They are used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). + /// + /// They are simply stored as a fast field, serialized in + /// the `.fieldnorm` file of the segment. 
pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result { self.fieldnorms_reader.get_field(field) } - - + /// Returns the number of documents containing the term. pub fn doc_freq(&self, term: &Term) -> u32 { match self.get_term_info(term) { @@ -92,7 +99,8 @@ impl SegmentReader { None => 0, } } - + + /// Accessor to the segment's `StoreReader`. pub fn get_store_reader(&self) -> &StoreReader { &self.store_reader } diff --git a/src/directory/error.rs b/src/directory/error.rs index 5beed41e2..a49ea23b7 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -1,7 +1,6 @@ use std::path::PathBuf; use std::io; - /// Error that may occur when opening a directory #[derive(Debug)] pub enum OpenDirectoryError { diff --git a/src/directory/mod.rs b/src/directory/mod.rs index 505ac8a7d..241d2888c 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -3,6 +3,8 @@ mod ram_directory; mod directory; mod read_only_source; mod shared_vec_slice; + +/// Errors specific to the directory module. pub mod error; use std::io::{Seek, Write}; diff --git a/src/error.rs b/src/error.rs index a51b369f0..0f7bf1358 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,5 +1,7 @@ #![allow(enum_variant_names)] +/// Definition of Tantivy's error and result. + use std::io; use std::result; use std::path::PathBuf; @@ -10,6 +12,10 @@ use query; use schema; +/// Tantivy result. +pub type Result = result::Result; + + /// Generic tantivy error. /// /// Any specialized error return in tantivy can be converted in `tantivy::Error`. @@ -87,6 +93,3 @@ impl From for Error { } } } - -/// Tantivy result. -pub type Result = result::Result; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 0ecbf1f30..02dca74e9 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1,3 +1,15 @@ +/// FastField module +/// +/// FastField are the equivalent of `DocValues` in `Lucene`. +/// FastFields are stored in column-oriented fashion and allow fast +/// random access given a `DocId`. 
+/// +/// Their performance is comparable to that of an array lookup. +/// FastFields are useful when a field is required for all or most of +/// the `DocSet`: for instance for scoring, grouping, filtering, or facetting. +/// +/// Currently only u32 fast fields are supported. + mod reader; mod writer; mod serializer; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 0b9271567..519d34825 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -36,6 +36,12 @@ type DocumentReceiver = chan::Receiver; type NewSegmentSender = chan::Sender>; type NewSegmentReceiver = chan::Receiver>; +/// `IndexWriter` is the user entry-point to add documents to an index. +/// +/// It manages a small number of indexing threads, as well as a shared +/// indexing queue. +/// Each indexing thread builds its own independent `Segment`, via +/// a `SegmentWriter` object. pub struct IndexWriter { index: Index, heap_size_in_bytes_per_thread: usize, @@ -68,7 +74,6 @@ fn index_documents(heap: &mut Heap, } - impl IndexWriter { /// Spawns a new worker thread for indexing. @@ -139,7 +144,8 @@ impl IndexWriter { } Ok(()) } - + + /// Merges a given list of segments pub fn merge(&mut self, segments: &[Segment]) -> Result<()> { let schema = self.index.schema(); let merger = try!(IndexMerger::open(schema, segments)); diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 6f7ef3ad4..6e27f6a94 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -1,5 +1,4 @@ use Result; -use Error; use std::io::Write; use rustc_serialize::json; @@ -10,6 +9,9 @@ use fastfield::FastFieldSerializer; use store::StoreWriter; use postings::PostingsSerializer; + +/// Segment serializer is in charge of laying out on disk +/// the data accumulated and sorted by the `SegmentWriter`. 
pub struct SegmentSerializer { segment: Segment, store_writer: StoreWriter, @@ -19,7 +21,8 @@ pub struct SegmentSerializer { } impl SegmentSerializer { - + + /// Creates a new `SegmentSerializer`. pub fn for_segment(segment: &mut Segment) -> Result { let store_write = try!(segment.open_write(SegmentComponent::STORE)); @@ -38,23 +41,28 @@ impl SegmentSerializer { fieldnorms_serializer: fieldnorms_serializer, }) } - + + /// Accessor to the `PostingsSerializer`. pub fn get_postings_serializer(&mut self,) -> &mut PostingsSerializer { &mut self.postings_serializer } + /// Accessor to the `FastFieldSerializer`. pub fn get_fast_field_serializer(&mut self,) -> &mut FastFieldSerializer { &mut self.fast_field_serializer } + /// Accessor to the field norm serializer. pub fn get_fieldnorms_serializer(&mut self,) -> &mut FastFieldSerializer { &mut self.fieldnorms_serializer } - + + /// Accessor to the `StoreWriter`. pub fn get_store_writer(&mut self,) -> &mut StoreWriter { &mut self.store_writer } - + + /// Write the `SegmentInfo` pub fn write_segment_info(&mut self, segment_info: &SegmentInfo) -> Result<()> { let mut write = try!(self.segment.open_write(SegmentComponent::INFO)); let json_data = json::encode(segment_info) @@ -63,7 +71,8 @@ impl SegmentSerializer { try!(write.flush()); Ok(()) } - + + /// Finalize the segment serialization. pub fn close(mut self,) -> Result<()> { try!(self.fast_field_serializer.close()); try!(self.postings_serializer.close()); diff --git a/src/lib.rs b/src/lib.rs index 61cbb6a50..0f335c425 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,11 @@ #![warn(missing_docs)] +//! # `tantivy` +//! +//! Tantivy is a search engine library. +//! Think `Lucene`, but in Rust. 
+ #[macro_use] extern crate lazy_static; @@ -61,18 +66,22 @@ pub use error::{Result, Error}; mod analyzer; mod datastruct; -pub mod postings; -pub mod query; -pub mod directory; +/// Query module +pub mod query; +/// Directory module +pub mod directory; +/// Collector module pub mod collector; +/// Postings module (also called inverted index) +pub mod postings; +/// Schema pub mod schema; + pub use directory::Directory; pub use core::searcher::Searcher; - -/// pub use core::Index; pub use indexer::IndexWriter; pub use schema::Term; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 7f955b9fd..374f08c33 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -1,3 +1,9 @@ +/// Postings module +/// +/// Postings, also called inverted lists, are the key data structure +/// of full-text search. + + mod postings; mod recorder; mod serializer; diff --git a/src/query/mod.rs b/src/query/mod.rs index bc0ecad23..12af7d35c 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -1,3 +1,8 @@ +/// Query module +/// +/// The query module groups all of tantivy's query objects. +/// + mod query; mod multi_term_query; mod multi_term_accumulator;