Added cli

This commit is contained in:
Paul Masurel
2016-08-11 10:48:24 +09:00
parent ca24daae51
commit edbff60d66
7 changed files with 635 additions and 0 deletions

103
src/cli/commands/bench.rs Normal file
View File

@@ -0,0 +1,103 @@
use tantivy::Index;
use tantivy::schema::{Field, Schema};
use tantivy::query::QueryParser;
use tantivy::query::Query;
use std::path::Path;
use tantivy::TimerTree;
use std::io::BufReader;
use std::io::BufRead;
use std::io;
use std::fs::File;
use tantivy::collector::chain;
use tantivy::collector::TopCollector;
use tantivy::collector::CountCollector;
use clap::ArgMatches;
use std::path::PathBuf;
pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> {
let index_path = PathBuf::from(matches.value_of("index").unwrap());
let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli.
let num_repeat = try!(value_t!(matches, "num_repeat", usize).map_err(|e|format!("Failed to read num_repeat argument as an integer. {:?}", e)));
run_bench(&index_path, &queries_path, num_repeat).map_err(From::from)
}
fn extract_search_fields(schema: &Schema) -> Vec<Field> {
schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| {
field_entry.is_indexed()
})
.map(|(field_id, _)| field_id as u8)
.map(Field)
.collect()
}
fn read_query_file(query_path: &Path) -> io::Result<Vec<String>> {
let query_file: File = try!(File::open(&query_path));
let file = BufReader::new(&query_file);
let mut queries = Vec::new();
for line_res in file.lines() {
let line = try!(line_res);
let query = String::from(line.trim());
queries.push(query);
}
Ok(queries)
}
fn run_bench(index_path: &Path,
query_filepath: &Path,
num_repeat: usize) -> Result<(), String> {
println!("index_path : {:?}", index_path);
println!("Query : {:?}", index_path);
println!("-------------------------------\n\n\n");
let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e)));
let searcher = try!(index.searcher().map_err(|e| format!("Failed to acquire searcher.\n{:?}", e)));
let default_search_fields: Vec<Field> = extract_search_fields(&index.schema());
let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e)));
let query_parser = QueryParser::new(index.schema(), default_search_fields);
println!("SEARCH\n");
println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs");
for _ in 0..num_repeat {
for query_txt in &queries {
let query = query_parser.parse_query(&query_txt).unwrap();
let num_terms = query.num_terms();
let mut top_collector = TopCollector::with_limit(10);
let mut count_collector = CountCollector::new();
let timing;
{
let mut collector = chain().add(&mut top_collector).add(&mut count_collector);
timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}", query_txt)));
}
println!("{}\t{}\t{}\t{}", query_txt, num_terms, count_collector.count(), timing.total_time());
}
}
println!("\n\nFETCH STORE\n");
println!("{}\t{}", "query", "time in microsecs");
for _ in 0..num_repeat {
for query_txt in &queries {
let query = query_parser.parse_query(&query_txt).unwrap();
let mut top_collector = TopCollector::with_limit(10);
try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e)));
let mut timer = TimerTree::new();
{
let h = timer.open("total");
for doc_address in top_collector.docs() {
searcher.doc(&doc_address).unwrap();
}
}
println!("{}\t{}", query_txt, timer.total_time());
}
}
Ok(())
}

96
src/cli/commands/index.rs Normal file
View File

@@ -0,0 +1,96 @@
use std::convert::From;
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use tantivy;
use tantivy::Index;
use time::PreciseTime;
use clap::ArgMatches;
pub fn run_index_cli(argmatch: &ArgMatches) -> tantivy::Result<()> {
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
let document_source = {
match argmatch.value_of("file") {
Some(path) => {
DocumentSource::FromFile(PathBuf::from(path))
}
None => DocumentSource::FromPipe,
}
};
run_index(index_directory, document_source)
}
enum DocumentSource {
FromPipe,
FromFile(PathBuf),
}
fn run_index(directory: PathBuf, document_source: DocumentSource) -> tantivy::Result<()> {
let index = try!(Index::open(&directory));
let schema = index.schema();
let mut index_writer = index.writer_with_num_threads(8).unwrap();
let articles = try!(document_source.read());
let mut num_docs = 0;
let mut cur = PreciseTime::now();
let group_count = 10000;
for article_line_res in articles.lines() {
let article_line = article_line_res.unwrap(); // TODO
match schema.parse_document(&article_line) {
Ok(doc) => {
index_writer.add_document(doc).unwrap();
}
Err(err) => {
println!("Failed to add document doc {:?}", err);
}
}
if num_docs > 0 && (num_docs % group_count == 0) {
println!("{} Docs", num_docs);
let new = PreciseTime::now();
let elapsed = cur.to(new);
println!("{:?} docs / hour", group_count * 3600 * 1e6 as u64 / (elapsed.num_microseconds().unwrap() as u64));
cur = new;
}
num_docs += 1;
}
index_writer.wait().unwrap(); // TODO
Ok(())
}
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct WikiArticle {
pub url: String,
pub title: String,
pub body: String,
}
impl DocumentSource {
fn read(&self,) -> io::Result<BufReader<Box<Read>>> {
Ok(match self {
&DocumentSource::FromPipe => {
BufReader::new(Box::new(io::stdin()))
}
&DocumentSource::FromFile(ref filepath) => {
let read_file = try!(File::open(&filepath));
BufReader::new(Box::new(read_file))
}
})
}
}

View File

25
src/cli/commands/mod.rs Normal file
View File

@@ -0,0 +1,25 @@
mod index;
mod serve;
mod new;
mod merge;
mod bench;
pub use self::new::run_new_cli;
pub use self::index::run_index_cli;
pub use self::serve::run_serve_cli;
pub use self::bench::run_bench_cli;
// pub mod writer;
// pub mod searcher;
// pub mod index;
// pub mod merger;
// mod segment_serializer;
// mod segment_writer;
// mod segment_reader;
// mod segment_id;
// mod segment_component;
// pub use self::segment_component::SegmentComponent;
// pub use self::segment_id::SegmentId;
// pub use self::segment_reader::SegmentReader;

148
src/cli/commands/new.rs Normal file
View File

@@ -0,0 +1,148 @@
use clap::ArgMatches;
use std::convert::From;
use std::path::PathBuf;
use tantivy;
use tantivy::schema::*;
use tantivy::Index;
use std::io;
use ansi_term::Style;
use ansi_term::Colour::{Red, Blue, Green, Black};
use std::io::Write;
use std::ascii::AsciiExt;
use rustc_serialize::json;
pub fn run_new_cli(matches: &ArgMatches) -> tantivy::Result<()> {
let index_directory = PathBuf::from(matches.value_of("index").unwrap());
run_new(index_directory)
}
fn prompt_input<P: Fn(&str) -> Result<(), String>>(prompt_text: &str, predicate: P) -> String {
loop {
print!("{prompt_text:<width$} ? ", prompt_text=Style::new().bold().fg(Blue).paint(prompt_text), width=40);
io::stdout().flush().unwrap();
let mut buffer = String::new();
io::stdin().read_line(&mut buffer).ok().expect("Failed to read line");
let answer = buffer.trim_right_matches("\n").to_string();
match predicate(&answer) {
Ok(()) => {
return answer;
}
Err(msg) => {
println!("Error: {}", Style::new().bold().fg(Red).paint(msg));
}
}
}
}
fn field_name_validate(field_name: &str) -> Result<(), String> {
if is_valid_field_name(field_name) {
Ok(())
}
else {
Err(String::from("Field name must match the pattern [_a-zA-Z0-9]+"))
}
}
fn prompt_options(msg: &str, codes: Vec<char>) -> char {
let options_string: Vec<String> = codes.iter().map(|c| format!("{}", c)).collect();
let options = options_string.join("/");
let predicate = |entry: &str| {
if entry.len() != 1 {
return Err(format!("Invalid input. Options are ({})", options))
}
let c = entry.chars().next().unwrap().to_ascii_uppercase();
if codes.contains(&c) {
return Ok(())
}
else {
return Err(format!("Invalid input. Options are ({})", options))
}
};
let message = format!("{} ({})", msg, options);
let entry = prompt_input(&message, predicate);
entry.chars().next().unwrap().to_ascii_uppercase()
}
fn prompt_yn(msg: &str) -> bool {
prompt_options(msg, vec!('Y', 'N')) == 'Y'
}
fn ask_add_field_text(field_name: &str, schema: &mut Schema) {
let mut text_options = TextOptions::new();
if prompt_yn("Should the field be stored") {
text_options = text_options.set_stored();
}
let is_indexed = prompt_yn("Should the field be indexed");
let indexing_options = if is_indexed {
if prompt_yn("Should the field be tokenized") {
if prompt_yn("Should the term frequencies (per doc) be in the index") {
if prompt_yn("Should the term positions (per doc) be in the index") {
TextIndexingOptions::TokenizedWithFreqAndPosition
}
else {
TextIndexingOptions::TokenizedWithFreq
}
}
else {
TextIndexingOptions::TokenizedNoFreq
}
}
else {
TextIndexingOptions::Unindexed
}
}
else {
TextIndexingOptions::Unindexed
};
text_options = text_options.set_indexing_options(indexing_options);
schema.add_text_field(field_name, text_options);
}
fn ask_add_field_u32(field_name: &str, schema: &mut Schema) {
let mut u32_options = U32Options::new();
if prompt_yn("Should the field be stored") {
u32_options = u32_options.set_stored();
}
if prompt_yn("Should the field be fast") {
u32_options = u32_options.set_fast();
}
if prompt_yn("Should the field be indexed") {
u32_options = u32_options.set_indexed();
}
schema.add_u32_field(field_name, u32_options);
}
fn ask_add_field(schema: &mut Schema) {
println!("\n\n");
let field_name = prompt_input("New field name ", field_name_validate);
let text_or_integer = prompt_options("Text or unsigned 32-bit Integer", vec!('T', 'I'));
if text_or_integer =='T' {
ask_add_field_text(&field_name, schema);
}
else {
ask_add_field_u32(&field_name, schema);
}
}
fn run_new(directory: PathBuf) -> tantivy::Result<()> {
println!("\n{} ", Style::new().bold().fg(Green).paint("Creating new index"));
println!("{} ", Style::new().bold().fg(Green).paint("Let's define it's schema!"));
let mut schema = Schema::new();
loop {
ask_add_field(&mut schema);
if !prompt_yn("Add another field") {
break;
}
}
let schema_json = format!("{}", json::as_pretty_json(&schema));
println!("\n{}\n", Style::new().on(Blue).fg(Black).paint(schema_json));
let mut index = try!(Index::create(&directory, schema));
index.save_metas()
}

162
src/cli/commands/serve.rs Normal file
View File

@@ -0,0 +1,162 @@
use clap::ArgMatches;
use iron::mime::Mime;
use iron::prelude::*;
use iron::status;
use iron::typemap::Key;
use mount::Mount;
use persistent::Read;
use rustc_serialize::json::as_pretty_json;
use staticfile::Static;
use std::convert::From;
use std::path::Path;
use std::path::PathBuf;
use tantivy;
use tantivy::collector;
use tantivy::collector::CountCollector;
use tantivy::collector::TopCollector;
use tantivy::Document;
use tantivy::Index;
use tantivy::query::Explanation;
use tantivy::query::Query;
use tantivy::query::QueryParser;
use tantivy::Result;
use tantivy::schema::Field;
use tantivy::Score;
use urlencoded::UrlEncodedQuery;
pub fn run_serve_cli(matches: &ArgMatches) -> tantivy::Result<()> {
let index_directory = PathBuf::from(matches.value_of("index").unwrap());
let port = value_t!(matches, "port", u16).unwrap_or(3000u16);
let host_str = matches.value_of("host").unwrap_or("localhost");
let host = format!("{}:{}", host_str, port);
run_serve(index_directory, &host)
}
#[derive(RustcDecodable, RustcEncodable)]
struct Serp {
q: String,
num_hits: usize,
hits: Vec<Hit>,
timings: Vec<Timing>,
}
#[derive(RustcDecodable, RustcEncodable)]
struct Hit {
title: String,
body: String,
explain: String,
score: Score,
}
#[derive(RustcDecodable, RustcEncodable)]
struct Timing {
name: String,
duration: i64,
}
struct IndexServer {
index: Index,
query_parser: QueryParser,
body_field: Field,
title_field: Field,
}
impl IndexServer {
fn load(path: &Path) -> IndexServer {
let index = Index::open(path).unwrap();
let schema = index.schema();
let body_field = schema.get_field("body").unwrap();
let title_field = schema.get_field("title").unwrap();
let query_parser = QueryParser::new(schema, vec!(body_field, title_field));
IndexServer {
index: index,
query_parser: query_parser,
title_field: title_field,
body_field: body_field,
}
}
fn create_hit(&self, doc: &Document, explain: Explanation) -> Hit {
Hit {
title: String::from(doc.get_first(self.title_field).unwrap().text()),
body: String::from(doc.get_first(self.body_field).unwrap().text().clone()),
explain: format!("{:?}", explain),
score: explain.val(),
}
}
fn search(&self, q: String) -> Result<Serp> {
let query = self.query_parser.parse_query(&q).unwrap();
let searcher = self.index.searcher().unwrap();
let mut count_collector = CountCollector::new();
let mut top_collector = TopCollector::with_limit(10);
{
let mut chained_collector = collector::chain()
.add(&mut top_collector)
.add(&mut count_collector);
try!(query.search(&searcher, &mut chained_collector));
}
let hits: Vec<Hit> = top_collector.docs()
.iter()
.map(|doc_address| {
let doc: Document = searcher.doc(doc_address).unwrap();
let explanation = query.explain(&searcher, doc_address).unwrap();
self.create_hit(&doc, explanation)
})
.collect();
Ok(Serp {
q: q,
hits: hits,
num_hits: count_collector.count(),
timings: Vec::new(),
})
}
}
impl Key for IndexServer {
type Value = IndexServer;
}
fn search(req: &mut Request) -> IronResult<Response> {
let index_server = req.get::<Read<IndexServer>>().unwrap();
match req.get_ref::<UrlEncodedQuery>() {
Ok(ref qs_map) => {
match qs_map.get("q") {
Some(qs) => {
let query = qs[0].clone();
let serp = index_server.search(query).unwrap();
let resp_json = as_pretty_json(&serp).indent(4);
let content_type = "application/json".parse::<Mime>().unwrap();
Ok(
Response::with((content_type, status::Ok, format!("{}", resp_json)))
)
}
None => {
Ok(Response::with((status::BadRequest, "Query not defined")))
}
}
}
Err(_) => Ok(Response::with((status::BadRequest, "Failed to parse query string")))
}
}
fn run_serve(directory: PathBuf, host: &str) -> tantivy::Result<()> {
let mut mount = Mount::new();
let server = IndexServer::load(&directory);
mount.mount("/api", search);
mount.mount("/", Static::new(Path::new("static/")));
let mut middleware = Chain::new(mount);
middleware.link(Read::<IndexServer>::both(server));
println!("listening on http://{}", host);
Iron::new(middleware).http(host).unwrap();
Ok(())
}

101
src/cli/main.rs Normal file
View File

@@ -0,0 +1,101 @@
#[macro_use]
extern crate clap;
#[macro_use]
extern crate lazy_static;
extern crate rustc_serialize;
extern crate tantivy;
extern crate time;
extern crate persistent;
extern crate urlencoded;
extern crate iron;
extern crate staticfile;
extern crate ansi_term;
extern crate mount;
use clap::{AppSettings, Arg, App, SubCommand};
mod commands;
use self::commands::*;
fn main() {
let index_arg = Arg::with_name("index")
.short("i")
.long("index")
.value_name("directory")
.help("Tantivy index directory filepath")
.required(true);
let cli_options = App::new("Tantivy")
.setting(AppSettings::SubcommandRequiredElseHelp)
.version("0.1")
.author("Paul Masurel <paul.masurel@gmail.com>")
.about("Tantivy Search Engine's command line interface.")
.subcommand(
SubCommand::with_name("new")
.about("Create a new index. The schema will be populated with a simple example schema")
.arg(index_arg.clone())
)
.subcommand(
SubCommand::with_name("serve")
.about("Start a server")
.arg(index_arg.clone())
.arg(Arg::with_name("host")
.long("host")
.value_name("host")
.help("host to listen to")
)
.arg(Arg::with_name("port")
.short("p")
.long("port")
.value_name("port")
.help("Port")
.default_value("localhost")
)
)
.subcommand(
SubCommand::with_name("index")
.about("Index files")
.arg(index_arg.clone())
.arg(Arg::with_name("file")
.short("f")
.long("file")
.value_name("file")
.help("File containing the documents to index."))
)
.subcommand(
SubCommand::with_name("bench")
.about("Run a benchmark on your index")
.arg(index_arg.clone())
.arg(Arg::with_name("queries")
.short("q")
.long("queries")
.value_name("queries")
.help("File containing queries (one-per line) to run in the benchmark.")
.required(true))
.arg(Arg::with_name("num_repeat")
.short("n")
.long("num_repeat")
.value_name("num_repeat")
.help("Number of time to repeat the benchmark.")
.default_value("1"))
)
.get_matches();
let (subcommand, some_options) = cli_options.subcommand();
let options = some_options.unwrap();
match subcommand {
"new" => run_new_cli(options).unwrap(),
"index" => run_index_cli(options).unwrap(),
"serve" => run_serve_cli(options).unwrap(),
"bench" => {
let res = run_bench_cli(options);
match res {
Err(e) => { println!("{}", e);}
_ => {}
}
},
_ => {}
}
}