From 8f812c7cd4f5ccc2964a13fc2c0d588392742e63 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 11 Aug 2016 17:54:06 +0900 Subject: [PATCH] blop --- TODO.md | 2 + docs/tutorial.md | 142 ++++++++++++++++++++++++++++++++-- script/build-static-binary.sh | 3 + src/cli/commands/bench.rs | 4 +- 4 files changed, 142 insertions(+), 9 deletions(-) mode change 100644 => 100755 script/build-static-binary.sh diff --git a/TODO.md b/TODO.md index e622a01a8..74e514acb 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,10 @@ +position not stored lenient mode for query parser phrase queries masks for union documentation query explain with proper term names +better schema JSON Arc for the schema error management diff --git a/docs/tutorial.md b/docs/tutorial.md index e7e86be25..167fbff40 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -12,7 +12,7 @@ with the articles of English wikipedia in it. There are two ways to get `tantivy`. If you are a rust programmer, you can run `cargo install tantivy`. Alternatively, if you are on `Linux 64bits`, you can download a -static binary here []() +static binary: [binaries/linux_x86_64/](http://fulmicoton.com/tantivy/binaries/linux_x86_64/tantivy) ## Step 2 - creating the index @@ -31,13 +31,141 @@ Our documents will contain * a body * a url -Running +Running `tantivy new` will start a wizard that will walk you through +the definition of the schema of our new index. ```bash - # create the directory - tantivy + tantivy new -i wikipedia-index ``` - - -https://www.dropbox.com/s/wwnfnu441w1ec9p/wiki-articles.json.bz2?dl=0 \ No newline at end of file +When asked, answer the questions as follows: + +``` + Creating new index + Let's define it's schema! + + + + New field name ? title + Text or unsigned 32-bit Integer (T/I) ? T + Should the field be stored (Y/N) ? Y + Should the field be indexed (Y/N) ? Y + Should the field be tokenized (Y/N) ? Y + Should the term frequencies (per doc) be in the index (Y/N) ? 
Y + Should the term positions (per doc) be in the index (Y/N) ? Y + Add another field (Y/N) ? Y + + + + New field name ? body + Text or unsigned 32-bit Integer (T/I) ? T + Should the field be stored (Y/N) ? Y + Should the field be indexed (Y/N) ? Y + Should the field be tokenized (Y/N) ? Y + Should the term frequencies (per doc) be in the index (Y/N) ? Y + Should the term positions (per doc) be in the index (Y/N) ? Y + Add another field (Y/N) ? Y + + + + New field name ? url + Text or unsigned 32-bit Integer (T/I) ? T + Should the field be stored (Y/N) ? Y + Should the field be indexed (Y/N) ? N + Add another field (Y/N) ? N + + [ + { + "variant": "Text", + "fields": [ + "title", + { + "indexing_options": "TokenizedWithFreqAndPosition", + "stored": true + } + ] + }, + { + "variant": "Text", + "fields": [ + "body", + { + "indexing_options": "TokenizedWithFreqAndPosition", + "stored": true + } + ] + }, + { + "variant": "Text", + "fields": [ + "url", + { + "indexing_options": "Unindexed", + "stored": true + } + ] + } + ] + +``` + +If you want to know more about the meaning of these options, you can check out the [schema doc page](http://fulmicoton.com/tantivy/tantivy/schema/index.html). + +The JSON displayed at the end has been written to `wikipedia-index/meta.json`. + + +## Step 3 - Get the documents to index + +Tantivy's `index` command offers a way to index a JSON file. +More precisely, the file must contain one JSON document per line. +The structure of each JSON object must match our schema definition. + +```json + {"body": "some text", "title": "some title", "url": "http://somedomain.com"} +``` + +You can download a corpus of more than 5 million Wikipedia articles +in the right format here: [wiki-articles.json (2.34 GB)](https://www.dropbox.com/s/wwnfnu441w1ec9p/wiki-articles.json.bz2?dl=0). 
+If you are in a rush, you can [download 100 articles in the right format here](http://fulmicoton.com/tantivy/tutorial/wiki-articles-first100.json). + +Make sure to decompress the file: + +```bash + bunzip2 wiki-articles.json.bz2 +``` + +## Step 4 - Index the documents + +The `index` command will index your documents. +By default, it will use as many threads as there are cores on your machine. + +On my computer (8-core Xeon(R) CPU X3450 @ 2.67GHz), it only takes 7 minutes. + +``` + cat wiki-articles.json | tantivy index -i wikipedia-index +``` + +## Step 5 - Have a look at the index directory + +```bash + ls wikipedia-index +``` + +If you indexed the 5 million articles, you should see a lot of files. +The main file is `meta.json`. + +Our index is in fact divided into segments. Each segment acts as an individual, smaller index. +Each segment is named by a UUID. +Each of its files stores a different data structure of the index. + + +## Step 6 - Serve a search index + +``` + tantivy serve -i wikipedia-index +``` + +You can start a small server with a JSON API to search Wikipedia. +By default, the server listens on port `3000`. + + diff --git a/script/build-static-binary.sh b/script/build-static-binary.sh old mode 100644 new mode 100755 index 3fa3b43c0..6b46cda16 --- a/script/build-static-binary.sh +++ b/script/build-static-binary.sh @@ -1,4 +1,7 @@ +#!/usr/bin/env bash + # the musl-tools package must be installed. 
rustup target add x86_64-unknown-linux-musl cargo build --release --target=x86_64-unknown-linux-musl +cp target/x86_64-unknown-linux-musl/release/tantivy ../tantivy_doc/binaries/tantivy diff --git a/src/cli/commands/bench.rs b/src/cli/commands/bench.rs index 1d378195b..6fee3c607 100644 --- a/src/cli/commands/bench.rs +++ b/src/cli/commands/bench.rs @@ -73,7 +73,7 @@ fn run_bench(index_path: &Path, let timing; { let mut collector = chain().add(&mut top_collector).add(&mut count_collector); - timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}", query_txt))); + timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))); } println!("{}\t{}\t{}\t{}", query_txt, num_terms, count_collector.count(), timing.total_time()); } @@ -89,7 +89,7 @@ fn run_bench(index_path: &Path, try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))); let mut timer = TimerTree::new(); { - let h = timer.open("total"); + let _scoped_timer_ = timer.open("total"); for doc_address in top_collector.docs() { searcher.doc(&doc_address).unwrap(); }