From 4773c8581c28c231bef55f8ff6384f2d09aee1d3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 14 Aug 2016 23:08:17 +0900 Subject: [PATCH] Moved cli to another project --- Cargo.toml | 14 ---- src/cli/commands/bench.rs | 103 ------------------------ src/cli/commands/index.rs | 101 ------------------------ src/cli/commands/merge.rs | 18 ----- src/cli/commands/mod.rs | 11 --- src/cli/commands/new.rs | 148 ----------------------------------- src/cli/commands/serve.rs | 160 -------------------------------------- src/cli/main.rs | 113 --------------------------- src/query/explanation.rs | 2 +- src/schema/document.rs | 2 - 10 files changed, 1 insertion(+), 671 deletions(-) delete mode 100644 src/cli/commands/bench.rs delete mode 100644 src/cli/commands/index.rs delete mode 100644 src/cli/commands/merge.rs delete mode 100644 src/cli/commands/mod.rs delete mode 100644 src/cli/commands/new.rs delete mode 100644 src/cli/commands/serve.rs delete mode 100644 src/cli/main.rs diff --git a/Cargo.toml b/Cargo.toml index d206bab85..e60e4be59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,12 +33,6 @@ itertools = "0.4.16" lz4 = "1.13.131" time = "0.1.34" uuid = "0.1" -iron = "0.4" -staticfile = "0.3.0" -persistent="0.2.0" -ansi_term = "0.8.0" -clap = "2" -mount = "0.2.1" [dev-dependencies] rand = "0.3.13" @@ -46,15 +40,7 @@ rand = "0.3.13" [build-dependencies] gcc = "0.3.24" -[[bin]] -name = "tantivy-cli" -path = "src/cli/main.rs" -doc = false - # [profile.release] # debug = true -[dependencies.urlencoded] -version = "0.4" - diff --git a/src/cli/commands/bench.rs b/src/cli/commands/bench.rs deleted file mode 100644 index 6fee3c607..000000000 --- a/src/cli/commands/bench.rs +++ /dev/null @@ -1,103 +0,0 @@ -use tantivy::Index; -use tantivy::schema::{Field, Schema}; -use tantivy::query::QueryParser; -use tantivy::query::Query; -use std::path::Path; -use tantivy::TimerTree; -use std::io::BufReader; -use std::io::BufRead; -use std::io; -use std::fs::File; -use tantivy::collector::chain; -use tantivy::collector::TopCollector; -use tantivy::collector::CountCollector; -use clap::ArgMatches; -use std::path::PathBuf; - - -pub fn run_bench_cli(matches: &ArgMatches) -> Result<(), String> { - let index_path = PathBuf::from(matches.value_of("index").unwrap()); - let queries_path = PathBuf::from(matches.value_of("queries").unwrap()); // the unwrap is safe as long as it is comming from the main cli. - let num_repeat = try!(value_t!(matches, "num_repeat", usize).map_err(|e|format!("Failed to read num_repeat argument as an integer. {:?}", e))); - run_bench(&index_path, &queries_path, num_repeat).map_err(From::from) -} - - -fn extract_search_fields(schema: &Schema) -> Vec { - schema.fields() - .iter() - .enumerate() - .filter(|&(_, field_entry)| { - field_entry.is_indexed() - }) - .map(|(field_id, _)| field_id as u8) - .map(Field) - .collect() -} - -fn read_query_file(query_path: &Path) -> io::Result> { - let query_file: File = try!(File::open(&query_path)); - let file = BufReader::new(&query_file); - let mut queries = Vec::new(); - for line_res in file.lines() { - let line = try!(line_res); - let query = String::from(line.trim()); - queries.push(query); - } - Ok(queries) -} - - -fn run_bench(index_path: &Path, - query_filepath: &Path, - num_repeat: usize) -> Result<(), String> { - - println!("index_path : {:?}", index_path); - println!("Query : {:?}", index_path); - println!("-------------------------------\n\n\n"); - - let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e))); - let searcher = try!(index.searcher().map_err(|e| format!("Failed to acquire searcher.\n{:?}", e))); - let default_search_fields: Vec = extract_search_fields(&index.schema()); - let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e))); - let query_parser = QueryParser::new(index.schema(), default_search_fields); - - println!("SEARCH\n"); - println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs"); - for _ in 0..num_repeat { - for query_txt in &queries { - let query = query_parser.parse_query(&query_txt).unwrap(); - let num_terms = query.num_terms(); - let mut top_collector = TopCollector::with_limit(10); - let mut count_collector = CountCollector::new(); - let timing; - { - let mut collector = chain().add(&mut top_collector).add(&mut count_collector); - timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e))); - } - println!("{}\t{}\t{}\t{}", query_txt, num_terms, count_collector.count(), timing.total_time()); - } - } - - - println!("\n\nFETCH STORE\n"); - println!("{}\t{}", "query", "time in microsecs"); - for _ in 0..num_repeat { - for query_txt in &queries { - let query = query_parser.parse_query(&query_txt).unwrap(); - let mut top_collector = TopCollector::with_limit(10); - try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e))); - let mut timer = TimerTree::new(); - { - let _scoped_timer_ = timer.open("total"); - for doc_address in top_collector.docs() { - searcher.doc(&doc_address).unwrap(); - } - } - println!("{}\t{}", query_txt, timer.total_time()); - } - } - - Ok(()) -} - diff --git a/src/cli/commands/index.rs b/src/cli/commands/index.rs deleted file mode 100644 index 35895a762..000000000 --- a/src/cli/commands/index.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::convert::From; -use std::fs::File; -use std::io; -use std::io::BufRead; -use std::io::BufReader; -use std::io::Read; -use std::path::PathBuf; -use tantivy; -use tantivy::Index; -use time::PreciseTime; -use clap::ArgMatches; - - -pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> { - let index_directory = PathBuf::from(argmatch.value_of("index").unwrap()); - let document_source = { - match argmatch.value_of("file") { - Some(path) => { - DocumentSource::FromFile(PathBuf::from(path)) - } - None => DocumentSource::FromPipe, - } - }; - let num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer."))); - run_index(index_directory, document_source, num_threads).map_err(|e| format!("Indexing failed : {:?}", e)) -} - -enum DocumentSource { - FromPipe, - FromFile(PathBuf), -} - -fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: usize) -> tantivy::Result<()> { - - let index = try!(Index::open(&directory)); - - let schema = index.schema(); - - let mut index_writer = try!( - if num_threads > 0 { - index.writer_with_num_threads(num_threads) - } - else { - index.writer() - } - ); - - let articles = try!(document_source.read()); - - let mut num_docs = 0; - let mut cur = PreciseTime::now(); - let group_count = 100000; - - for article_line_res in articles.lines() { - let article_line = article_line_res.unwrap(); // TODO - match schema.parse_document(&article_line) { - Ok(doc) => { - index_writer.add_document(doc).unwrap(); - } - Err(err) => { - println!("Failed to add document doc {:?}", err); - } - } - if num_docs > 0 && (num_docs % group_count == 0) { - println!("{} Docs", num_docs); - let new = PreciseTime::now(); - let elapsed = cur.to(new); - println!("{:?} docs / hour", group_count * 3600 * 1_000_000 as u64 / (elapsed.num_microseconds().unwrap() as u64)); - cur = new; - } - - num_docs += 1; - - } - index_writer.wait().unwrap(); // TODO - Ok(()) -} - - -#[derive(Clone,Debug,RustcDecodable,RustcEncodable)] -pub struct WikiArticle { - pub url: String, - pub title: String, - pub body: String, -} - - -impl DocumentSource { - fn read(&self,) -> io::Result>> { - Ok(match self { - &DocumentSource::FromPipe => { - BufReader::new(Box::new(io::stdin())) - } - &DocumentSource::FromFile(ref filepath) => { - let read_file = try!(File::open(&filepath)); - BufReader::new(Box::new(read_file)) - } - }) - } -} - diff --git a/src/cli/commands/merge.rs b/src/cli/commands/merge.rs deleted file mode 100644 index db61e4acf..000000000 --- a/src/cli/commands/merge.rs +++ /dev/null @@ -1,18 +0,0 @@ -extern crate tantivy; - -use tantivy::Index; -use std::path::PathBuf; -use clap::ArgMatches; - -pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> { - let index_directory = PathBuf::from(argmatch.value_of("index").unwrap()); - run_merge(index_directory).map_err(|e| format!("Indexing failed : {:?}", e)) -} - - -fn run_merge(path: PathBuf) -> tantivy::Result<()> { - let index = try!(Index::open(&path)); - let segments = index.segments(); - let mut index_writer = try!(index.writer()); - index_writer.merge(&segments) -} diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs deleted file mode 100644 index fc300536a..000000000 --- a/src/cli/commands/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -mod index; -mod serve; -mod new; -mod bench; -mod merge; - -pub use self::new::run_new_cli; -pub use self::index::run_index_cli; -pub use self::serve::run_serve_cli; -pub use self::bench::run_bench_cli; -pub use self::merge::run_merge_cli; diff --git a/src/cli/commands/new.rs b/src/cli/commands/new.rs deleted file mode 100644 index 64ba9201b..000000000 --- a/src/cli/commands/new.rs +++ /dev/null @@ -1,148 +0,0 @@ -use clap::ArgMatches; -use std::convert::From; -use std::path::PathBuf; -use tantivy; -use tantivy::schema::*; -use tantivy::Index; -use std::io; -use ansi_term::Style; -use ansi_term::Colour::{Red, Blue, Green}; -use std::io::Write; -use std::ascii::AsciiExt; -use rustc_serialize::json; - - -pub fn run_new_cli(matches: &ArgMatches) -> tantivy::Result<()> { - let index_directory = PathBuf::from(matches.value_of("index").unwrap()); - run_new(index_directory) -} - - -fn prompt_input Result<(), String>>(prompt_text: &str, predicate: P) -> String { - loop { - print!("{prompt_text: { - return answer; - } - Err(msg) => { - println!("Error: {}", Style::new().bold().fg(Red).paint(msg)); - } - } - } -} - - -fn field_name_validate(field_name: &str) -> Result<(), String> { - if is_valid_field_name(field_name) { - Ok(()) - } - else { - Err(String::from("Field name must match the pattern [_a-zA-Z0-9]+")) - } -} - - -fn prompt_options(msg: &str, codes: Vec) -> char { - let options_string: Vec = codes.iter().map(|c| format!("{}", c)).collect(); - let options = options_string.join("/"); - let predicate = |entry: &str| { - if entry.len() != 1 { - return Err(format!("Invalid input. Options are ({})", options)) - } - let c = entry.chars().next().unwrap().to_ascii_uppercase(); - if codes.contains(&c) { - return Ok(()) - } - else { - return Err(format!("Invalid input. Options are ({})", options)) - } - }; - let message = format!("{} ({})", msg, options); - let entry = prompt_input(&message, predicate); - entry.chars().next().unwrap().to_ascii_uppercase() -} - -fn prompt_yn(msg: &str) -> bool { - prompt_options(msg, vec!('Y', 'N')) == 'Y' -} - - -fn ask_add_field_text(field_name: &str, schema: &mut Schema) { - let mut text_options = TextOptions::new(); - if prompt_yn("Should the field be stored") { - text_options = text_options.set_stored(); - } - let is_indexed = prompt_yn("Should the field be indexed"); - let indexing_options = if is_indexed { - if prompt_yn("Should the field be tokenized") { - if prompt_yn("Should the term frequencies (per doc) be in the index") { - if prompt_yn("Should the term positions (per doc) be in the index") { - TextIndexingOptions::TokenizedWithFreqAndPosition - } - else { - TextIndexingOptions::TokenizedWithFreq - } - } - else { - TextIndexingOptions::TokenizedNoFreq - } - } - else { - TextIndexingOptions::Unindexed - } - } - else { - TextIndexingOptions::Unindexed - }; - text_options = text_options.set_indexing_options(indexing_options); - schema.add_text_field(field_name, text_options); -} - - -fn ask_add_field_u32(field_name: &str, schema: &mut Schema) { - let mut u32_options = U32Options::new(); - if prompt_yn("Should the field be stored") { - u32_options = u32_options.set_stored(); - } - if prompt_yn("Should the field be fast") { - u32_options = u32_options.set_fast(); - } - if prompt_yn("Should the field be indexed") { - u32_options = u32_options.set_indexed(); - } - schema.add_u32_field(field_name, u32_options); -} - -fn ask_add_field(schema: &mut Schema) { - println!("\n\n"); - let field_name = prompt_input("New field name ", field_name_validate); - let text_or_integer = prompt_options("Text or unsigned 32-bit Integer", vec!('T', 'I')); - if text_or_integer =='T' { - ask_add_field_text(&field_name, schema); - } - else { - ask_add_field_u32(&field_name, schema); - } -} - -fn run_new(directory: PathBuf) -> tantivy::Result<()> { - println!("\n{} ", Style::new().bold().fg(Green).paint("Creating new index")); - println!("{} ", Style::new().bold().fg(Green).paint("Let's define it's schema!")); - let mut schema = Schema::new(); - loop { - ask_add_field(&mut schema); - if !prompt_yn("Add another field") { - break; - } - } - let schema_json = format!("{}", json::as_pretty_json(&schema)); - println!("\n{}\n", Style::new().fg(Green).paint(schema_json)); - let mut index = try!(Index::create(&directory, schema)); - index.save_metas() -} - diff --git a/src/cli/commands/serve.rs b/src/cli/commands/serve.rs deleted file mode 100644 index 809ee4c58..000000000 --- a/src/cli/commands/serve.rs +++ /dev/null @@ -1,160 +0,0 @@ -use clap::ArgMatches; -use iron::mime::Mime; -use iron::prelude::*; -use iron::status; -use iron::typemap::Key; -use mount::Mount; -use persistent::Read; -use rustc_serialize::json::as_pretty_json; -use rustc_serialize::json::Json; -use staticfile::Static; -use std::convert::From; -use std::path::Path; -use std::path::PathBuf; -use tantivy; -use tantivy::collector; -use tantivy::collector::CountCollector; -use tantivy::collector::TopCollector; -use tantivy::Document; -use tantivy::Index; -use tantivy::query::Explanation; -use tantivy::query::Query; -use tantivy::query::QueryParser; -use tantivy::Result; -use tantivy::schema::{Field, Schema}; -use tantivy::Score; -use tantivy::schema::NamedFieldDocument; -use urlencoded::UrlEncodedQuery; - - -pub fn run_serve_cli(matches: &ArgMatches) -> tantivy::Result<()> { - let index_directory = PathBuf::from(matches.value_of("index").unwrap()); - let port = value_t!(matches, "port", u16).unwrap_or(3000u16); - let host_str = matches.value_of("host").unwrap_or("localhost"); - let host = format!("{}:{}", host_str, port); - run_serve(index_directory, &host) -} - - -#[derive(RustcEncodable)] -struct Serp { - q: String, - num_hits: usize, - hits: Vec, - timings: Vec, -} - -#[derive(RustcEncodable)] -struct Hit { - doc: NamedFieldDocument, - explain: String, - score: Score, -} - -#[derive(RustcEncodable)] -struct Timing { - name: String, - duration: i64, -} - -struct IndexServer { - index: Index, - query_parser: QueryParser, - schema: Schema, -} - -impl IndexServer { - - fn load(path: &Path) -> IndexServer { - let index = Index::open(path).unwrap(); - let schema = index.schema(); - let body_field = schema.get_field("body").unwrap(); - let title_field = schema.get_field("title").unwrap(); - let query_parser = QueryParser::new(schema.clone(), vec!(body_field, title_field)); - IndexServer { - index: index, - query_parser: query_parser, - schema: schema, - } - } - - fn create_hit(&self, doc: &Document, explain: Explanation) -> Hit { - Hit { - doc: self.index.schema().to_named_doc(&doc), - explain: format!("{:?}", explain), - score: explain.val(), - } - } - - fn search(&self, q: String) -> Result { - let query = self.query_parser.parse_query(&q).unwrap(); - let searcher = self.index.searcher().unwrap(); - let mut count_collector = CountCollector::new(); - let mut top_collector = TopCollector::with_limit(10); - - { - let mut chained_collector = collector::chain() - .add(&mut top_collector) - .add(&mut count_collector); - try!(query.search(&searcher, &mut chained_collector)); - } - let hits: Vec = top_collector.docs() - .iter() - .map(|doc_address| { - let doc: Document = searcher.doc(doc_address).unwrap(); - let explanation = query.explain(&searcher, doc_address).unwrap(); - self.create_hit(&doc, explanation) - }) - .collect(); - Ok(Serp { - q: q, - hits: hits, - num_hits: count_collector.count(), - timings: Vec::new(), - }) - } -} - -impl Key for IndexServer { - type Value = IndexServer; -} - -fn search(req: &mut Request) -> IronResult { - let index_server = req.get::>().unwrap(); - match req.get_ref::() { - Ok(ref qs_map) => { - match qs_map.get("q") { - Some(qs) => { - let query = qs[0].clone(); - let serp = index_server.search(query).unwrap(); - let resp_json = as_pretty_json(&serp).indent(4); - let content_type = "application/json".parse::().unwrap(); - Ok( - Response::with((content_type, status::Ok, format!("{}", resp_json))) - ) - } - None => { - Ok(Response::with((status::BadRequest, "Query not defined"))) - } - } - } - Err(_) => Ok(Response::with((status::BadRequest, "Failed to parse query string"))) - } -} - - -fn run_serve(directory: PathBuf, host: &str) -> tantivy::Result<()> { - let mut mount = Mount::new(); - let server = IndexServer::load(&directory); - - mount.mount("/api", search); - mount.mount("/", Static::new(Path::new("static/"))); - - let mut middleware = Chain::new(mount); - middleware.link(Read::::both(server)); - - println!("listening on http://{}", host); - Iron::new(middleware).http(host).unwrap(); - Ok(()) -} - diff --git a/src/cli/main.rs b/src/cli/main.rs deleted file mode 100644 index 6d02e795a..000000000 --- a/src/cli/main.rs +++ /dev/null @@ -1,113 +0,0 @@ -#[macro_use] -extern crate clap; -#[macro_use] -extern crate lazy_static; -extern crate rustc_serialize; -extern crate tantivy; -extern crate time; -extern crate persistent; -extern crate urlencoded; -extern crate iron; -extern crate staticfile; -extern crate ansi_term; -extern crate mount; - -use clap::{AppSettings, Arg, App, SubCommand}; -mod commands; -use self::commands::*; - - -fn main() { - let index_arg = Arg::with_name("index") - .short("i") - .long("index") - .value_name("directory") - .help("Tantivy index directory filepath") - .required(true); - - let cli_options = App::new("Tantivy") - .setting(AppSettings::SubcommandRequiredElseHelp) - .version("0.1") - .author("Paul Masurel ") - .about("Tantivy Search Engine's command line interface.") - .subcommand( - SubCommand::with_name("new") - .about("Create a new index. The schema will be populated with a simple example schema") - .arg(index_arg.clone()) - ) - .subcommand( - SubCommand::with_name("serve") - .about("Start a server") - .arg(index_arg.clone()) - .arg(Arg::with_name("host") - .long("host") - .value_name("host") - .help("host to listen to") - ) - .arg(Arg::with_name("port") - .short("p") - .long("port") - .value_name("port") - .help("Port") - .default_value("localhost") - ) - ) - .subcommand( - SubCommand::with_name("index") - .about("Index files") - .arg(index_arg.clone()) - .arg(Arg::with_name("file") - .short("f") - .long("file") - .value_name("file") - .help("File containing the documents to index.")) - .arg(Arg::with_name("num_threads") - .short("t") - .long("num_threads") - .value_name("num_threads") - .help("Number of indexing thread. By default num cores - 1 will be used") - .default_value("0")) - ) - .subcommand( - SubCommand::with_name("bench") - .about("Run a benchmark on your index") - .arg(index_arg.clone()) - .arg(Arg::with_name("queries") - .short("q") - .long("queries") - .value_name("queries") - .help("File containing queries (one-per line) to run in the benchmark.") - .required(true)) - .arg(Arg::with_name("num_repeat") - .short("n") - .long("num_repeat") - .value_name("num_repeat") - .help("Number of time to repeat the benchmark.") - .default_value("1")) - ) - .subcommand( - SubCommand::with_name("merge") - .about("Merge all the segments of an index") - .arg(index_arg.clone()) - ) - .get_matches(); - - let (subcommand, some_options) = cli_options.subcommand(); - - let options = some_options.unwrap(); - - match subcommand { - "new" => run_new_cli(options).unwrap(), - "index" => run_index_cli(options).unwrap(), - "serve" => run_serve_cli(options).unwrap(), - "merge" => run_merge_cli(options).unwrap(), - "bench" => { - let res = run_bench_cli(options); - match res { - Err(e) => { println!("{}", e);} - _ => {} - } - }, - _ => {} - } -} \ No newline at end of file diff --git a/src/query/explanation.rs b/src/query/explanation.rs index 4ebe86df5..ab7f98ff9 100644 --- a/src/query/explanation.rs +++ b/src/query/explanation.rs @@ -1,7 +1,7 @@ use std::fmt; use std::iter; -#[derive(RustcDecodable)] +#[derive(RustcEncodable)] pub struct Explanation { val: f32, description: String, diff --git a/src/schema/document.rs b/src/schema/document.rs index 895a0c9c5..2c32bad17 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -97,8 +97,6 @@ impl From> for Document { #[cfg(test)] mod tests { - - use super::*; use schema::Schema; use schema::TEXT;