From 841a54546eea101e968370d4dc11682e016e5f13 Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Thu, 11 Aug 2016 21:18:59 +0900
Subject: [PATCH] added extra doc file

---
 .travis.yml                  |  7 +++--
 docs/datastruct.md           | 61 ++++++++++++++++++++++++++++++++++++
 docs/style.css               | 37 ++++++++++++++++++++++
 docs/tutorial.md             | 22 ++++++-------
 script/build-doc.sh          | 10 ++++++
 src/postings/freq_handler.rs |  3 +-
 src/postings/term_info.rs    | 11 +++++++
 7 files changed, 135 insertions(+), 16 deletions(-)
 create mode 100644 docs/datastruct.md
 create mode 100644 docs/style.css
 create mode 100755 script/build-doc.sh

diff --git a/.travis.yml b/.travis.yml
index b33c4ed25..1f4ea5e41 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,6 +34,7 @@ script:
     travis-cargo bench &&
     travis-cargo doc
 after_success:
-    - travis-cargo doc-upload
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
+  - bash ./script/build-doc.sh
+  - travis-cargo doc-upload
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
diff --git a/docs/datastruct.md b/docs/datastruct.md
new file mode 100644
index 000000000..aefe0c4af
--- /dev/null
+++ b/docs/datastruct.md
@@ -0,0 +1,61 @@
+% Tantivy's datastructure and index format
+
+This document explains how tantivy works, and specifically 
+what kind of datastructures are used to index and store the data.
+
+# An inverted index
+
+As you may know, an idea central to search engines is to assign a document id 
+to each document, and build an inverted index, which is simply
+a datastructure associating each term (word) to a sorted list of doc ids.   
+
+Such an index then makes it possible to compute the union or
+the intersection of the documents containing two terms
+in `O(1)` memory and `O(n)` time.
+
+## Term dictionary
+
+Tantivy's term dictionaries (`.term` files) are stored as
+finite state transducers (courtesy of the excellent
+[`fst`](https://github.com/BurntSushi/fst) crate).
+
+For each term, the dictionary associates
+a [TermInfo](http://fulmicoton.com/tantivy/tantivy/postings/struct.TermInfo.html),
+which contains all of the information required to access the list of doc ids of the documents containing
+the term.
+
+In fact, `fst` can only associate terms with a 64-bit integer. [`FstMap`](https://github.com/fulmicoton/tantivy/blob/master/src/datastruct/fstmap.rs) is
+in charge of building a key-value map on top of it.  
+
+
+## Postings
+
+The posting lists (sorted list of doc ids) are encoded in the `.idx` file.
+Optionally, you can specify in your schema that you want term frequencies to be encoded
+in the index file (if you do not, the index will behave as if all documents
+have a term frequency of 1).
+Tf-idf scoring requires the term frequency (number of times the term appears in the field of the document)
+for each document.
+
+
+# Segments
+
+A tantivy index is divided into segments.
+Each segment is an independent structure.
+
+This has many benefits. For instance, assuming you are
+trying to index one billion documents, you could split
+your corpus into N pieces, index them on Hadoop, copy all
+of the resulting segments into the same directory
+and edit the index's `meta.json` file to list all of the segments.
+
+This strong division also greatly simplifies multithreaded indexing.
+Each thread actually builds its own segment.
+
+
+## 
+
+# Store
+
+The store 
+When a document  
\ No newline at end of file
diff --git a/docs/style.css b/docs/style.css
new file mode 100644
index 000000000..be35d451d
--- /dev/null
+++ b/docs/style.css
@@ -0,0 +1,37 @@
+/* Layout: leave a 300px gutter on the left for the fixed navigation. */
+body {
+    max-width: 1000px;
+    padding-left: 300px;
+}
+
+nav {
+    width: 300px;
+    position: fixed;
+    left: 0;
+    top: 0;
+    padding: 30px;
+    border-bottom: none !important;
+}
+
+nav > ul {
+    padding-left: 0;
+}
+
+nav ul, nav li {
+    list-style: none;
+}
+
+/* Page title. */
+h1.title {
+    font-size: 2em;
+    color: #df3600;
+}
+
+h1, h2.section-header {
+    color: #6d6c6c;
+}
+
+/* Navigation links. */
+nav a {
+    color: #187ec1;
+}
\ No newline at end of file
diff --git a/docs/tutorial.md b/docs/tutorial.md
index 167fbff40..71675bcf4 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -1,20 +1,19 @@
-# Indexing Wikipedia with Tantivy CLI interface
+% Tutorial: Indexing Wikipedia with Tantivy CLI
 
-## Introduction
+# Introduction
 
 In this tutorial, we will create a brand new index
 with the articles of English wikipedia in it.
 
- 
- 
-## Step 1 - Get tantivy CLI interface
+
+# Install
 
 There are two ways to get `tantivy`.
 If you are a rust programmer, you can run `cargo install tantivy`.
 Alternatively, if you are on `Linux 64bits`, you can download a
 static binary:  [binaries/linux_x86_64/](http://fulmicoton.com/tantivy/binaries/linux_x86_64/tantivy) 
 
-## Step 2 - creating the index
+# Creating the index
 
 Create a directory in which your index will be stored.
 
@@ -40,7 +39,7 @@ the definition of the schema of our new index.
 
 When asked answer to the question as follows:
 
-```
+```none
     Creating new index 
     Let's define it's schema! 
 
@@ -114,7 +113,7 @@ If you want to know more about the meaning of these options, you can check out t
 The json displayed at the end has been written in `wikipedia-index/meta.json`.
 
 
-# Step 3 - Get the documents to index
+# Get the documents to index
 
 Tantivy's index command offers a way to index a json file.
 More accurately, the file must contain one document per line, in a json format.
@@ -134,7 +133,7 @@ Make sure to uncompress the file
     bunzip2 wiki-articles.json.bz2
 ``` 
 
-# Step 4 -  Index the documents.
+# Index the documents.
 
 The `index` command will index your document.
 By default it will use as many threads as there are core on your machine.
@@ -145,7 +144,8 @@ On my computer (8 core Xeon(R) CPU X3450  @ 2.67GHz), it only takes 7 minutes.
     cat /data/wiki-articles | tantivy index -i wikipedia-index
 ```
 
-# Step 5 - Have a look at the index directory
+While it is indexing, you can peek at the index directory
+to check what is happening.
 
 ```bash
     ls wikipedia-index
@@ -159,7 +159,7 @@ It is named by a uuid.
 Each different files is storing a different datastructure for the index.
 
 
-# Step 6 - Serve a search index
+# Serve the search index
 
 ```
     tantivy serve -i wikipedia-index
diff --git a/script/build-doc.sh b/script/build-doc.sh
new file mode 100755
index 000000000..7c36a9919
--- /dev/null
+++ b/script/build-doc.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Render every markdown file in docs/ with rustdoc into the crate's doc directory.
+DEST=target/doc/tantivy/docs/
+mkdir -p "$DEST"
+for f in docs/*.md
+do
+    rustdoc "$f" -o "$DEST" --markdown-css ../../rustdoc.css --markdown-css style.css
+done
+# Copy the stylesheet(s) referenced by --markdown-css next to the rendered pages.
+cp docs/*.css "$DEST"
\ No newline at end of file
diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs
index 5b2d1a873..a4e7eac45 100644
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -54,9 +54,8 @@ impl FreqHandler {
                 block_decoder.output(idx)
             }
             FreqHandler::NoFreq => {
-                0
+                1u32
             }
         }
     }
-
 }
\ No newline at end of file
diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs
index a49f91b9d..adbc16aa3 100644
--- a/src/postings/term_info.rs
+++ b/src/postings/term_info.rs
@@ -1,6 +1,17 @@
 use common::BinarySerializable;
 use std::io;
 
+
+/// `TermInfo` contains all of the information
+/// associated with a term in the `.term` file.
+///
+/// It consists of
+/// * `doc_freq`: the number of documents in the segment
+/// containing this term. It is also the length of the
+/// posting list associated with this term.
+/// * `postings_offset`: an offset in the `.idx` file
+/// addressing the start of the posting list associated
+/// with this term.
 #[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
 pub struct TermInfo {
     pub doc_freq: u32,