From 841a54546eea101e968370d4dc11682e016e5f13 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 11 Aug 2016 21:18:59 +0900 Subject: [PATCH] added extra doc file --- .travis.yml | 7 +++-- docs/datastruct.md | 61 ++++++++++++++++++++++++++++++++++++ docs/style.css | 37 ++++++++++++++++++++++ docs/tutorial.md | 22 ++++++------- script/build-doc.sh | 10 ++++++ src/postings/freq_handler.rs | 3 +- src/postings/term_info.rs | 11 +++++++ 7 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 docs/datastruct.md create mode 100644 docs/style.css create mode 100755 script/build-doc.sh diff --git a/.travis.yml b/.travis.yml index b33c4ed25..1f4ea5e41 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,7 @@ script: travis-cargo bench && travis-cargo doc after_success: - - travis-cargo doc-upload - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi + - bash ./script/build-doc.sh + - travis-cargo doc-upload + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi diff --git a/docs/datastruct.md b/docs/datastruct.md new file mode 100644 index 000000000..aefe0c4af --- /dev/null +++ b/docs/datastruct.md @@ -0,0 +1,61 @@ +% Tantivy's datastructure and index format + +This document explains how tantivy works, and specifically +what kind of datastructures are used to index and store the data. + +# An inverted index + +As you may know, an idea central to search engines is to assign a document id +to each document, and build an inverted index, which is simply +a datastructure associating each term (word) to a sorted list of doc ids. 
+ +Such an index then makes it possible to compute the union or +the intersection of the documents containing two terms +in `O(1)` memory and `O(n)` time. + +## Term dictionary + +Tantivy's term dictionaries (`.term` files) are each stored as +a finite state transducer (courtesy of the excellent +[`fst`](https://github.com/BurntSushi/fst) crate). + +For each term, the dictionary associates +a [TermInfo](http://fulmicoton.com/tantivy/tantivy/postings/struct.TermInfo.html), +which contains all of the information required to access the list of doc ids of the documents containing +the term. + +In fact, `fst` can only associate terms with a 64-bit integer. [`FstMap`](https://github.com/fulmicoton/tantivy/blob/master/src/datastruct/fstmap.rs) is +in charge of building a KV map on top of it. + + +## Postings + +The posting lists (sorted list of doc ids) are encoded in the `.idx` file. +Optionally, you can specify in your schema that you want term frequencies to be encoded +in the index file (if you do not, the index will behave as if all documents +have a term frequency of 1). +Tf-idf scoring requires the term frequency (the number of times the term appears in the field of the document) +for each document. + + +# Segments + +A tantivy index is divided into segments. +Each segment is an independent structure. + +This has many benefits. For instance, assuming you are +trying to index one billion documents, you could split +your corpus into N pieces, index them on Hadoop, copy all +of the resulting segments into the same directory +and edit the index's `meta.json` file to list all of the segments. + +This strong division also greatly simplifies multithreaded indexing. +Each thread actually builds its own segment. 
+ + +## + +# Store + +The store +When a document \ No newline at end of file diff --git a/docs/style.css b/docs/style.css new file mode 100644 index 000000000..be35d451d --- /dev/null +++ b/docs/style.css @@ -0,0 +1,37 @@ +body { + max-width: 1000px; + padding-left: 300px; +} + +nav { + width: 300px; + position: fixed; + left: 0; + top: 0; + padding: 30px; + border-bottom: none !important; +} + +nav > ul { + padding-left: 0px; +} + +nav ul, nav li { + list-style: none; +} + +h1.title { + font-size: 2em; +} + +nav a, h1, h2.section-header { + color: #6d6c6c; +} + +nav a { + color: #187ec1; +} + +h1.title { + color: #df3600; +} \ No newline at end of file diff --git a/docs/tutorial.md b/docs/tutorial.md index 167fbff40..71675bcf4 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,20 +1,19 @@ -# Indexing Wikipedia with Tantivy CLI interface +% Tutorial: Indexing Wikipedia with Tantivy CLI -## Introduction +# Introduction In this tutorial, we will create a brand new index with the articles of English wikipedia in it. - - -## Step 1 - Get tantivy CLI interface + +# Install There are two ways to get `tantivy`. If you are a rust programmer, you can run `cargo install tantivy`. Alternatively, if you are on `Linux 64bits`, you can download a static binary: [binaries/linux_x86_64/](http://fulmicoton.com/tantivy/binaries/linux_x86_64/tantivy) -## Step 2 - creating the index +# Creating the index Create a directory in which your index will be stored. @@ -40,7 +39,7 @@ the definition of the schema of our new index. When asked answer to the question as follows: -``` +```none Creating new index Let's define it's schema! @@ -114,7 +113,7 @@ If you want to know more about the meaning of these options, you can check out t The json displayed at the end has been written in `wikipedia-index/meta.json`. -# Step 3 - Get the documents to index +# Get the documents to index Tantivy's index command offers a way to index a json file. 
More accurately, the file must contain one document per line, in a json format. @@ -134,7 +133,7 @@ Make sure to uncompress the file bunzip2 wiki-articles.json.bz2 ``` -# Step 4 - Index the documents. +# Index the documents. The `index` command will index your document. By default it will use as many threads as there are core on your machine. @@ -145,7 +144,8 @@ On my computer (8 core Xeon(R) CPU X3450 @ 2.67GHz), it only takes 7 minutes. cat /data/wiki-articles | tantivy index -i wikipedia-index ``` -# Step 5 - Have a look at the index directory +While it is indexing, you can peek at the index directory +to check what is happening. ```bash ls wikipedia-index @@ -159,7 +159,7 @@ It is named by a uuid. Each different files is storing a different datastructure for the index. -# Step 6 - Serve a search index +# Serve the search index ``` tantivy serve -i wikipedia-index diff --git a/script/build-doc.sh b/script/build-doc.sh new file mode 100755 index 000000000..7c36a9919 --- /dev/null +++ b/script/build-doc.sh @@ -0,0 +1,10 @@ +#!/bin/bash +DEST=target/doc/tantivy/docs/ +mkdir -p $DEST + +for f in $(ls docs/*.md) +do + rustdoc $f -o $DEST --markdown-css ../../rustdoc.css --markdown-css style.css +done + +cp docs/*.css $DEST \ No newline at end of file diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 5b2d1a873..a4e7eac45 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -54,9 +54,8 @@ impl FreqHandler { block_decoder.output(idx) } FreqHandler::NoFreq => { - 0 + 1u32 } } } - } \ No newline at end of file diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index a49f91b9d..adbc16aa3 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,6 +1,17 @@ use common::BinarySerializable; use std::io; + +// `TermInfo` contains all of the information +// associated to terms in the `.term` file. 
+// +// It consists of +// * doc_freq : the number of documents in the segment +// containing this term. It is also the length of the +// posting list associated with this term +// * postings_offset: an offset in the `.idx` file +// addressing the start of the posting list associated +// with this term. #[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)] pub struct TermInfo { pub doc_freq: u32,