diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 950a42afd..b131d876c 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -23,7 +23,6 @@ use tantivy::Index; fn main() -> tantivy::Result<()> { // this example assumes you understand the content in `basic_search` - let index_path = TempDir::new("tantivy_stopwords_example_dir")?; let mut schema_builder = SchemaBuilder::default(); // This configures your custom options for how tantivy will @@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> { // to note is that we are setting the tokenizer to `stoppy` // which will be defined and registered below. let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); // Our first field is title. schema_builder.add_text_field("title", text_options); // Our second field is body. 
let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); schema_builder.add_text_field("body", text_options); let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_ram(schema.clone()); // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); index.tokenizers().register("stoppy", tokenizer); @@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> { )); index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. 
The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); index_writer.add_document(doc!( title => "Frankenstein", @@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> { let query_parser = QueryParser::for_index(&index, vec![title, body]); - // this will have NO hits because it was filtered out - // because the query is run through the analyzer you - // actually will get an error here because the query becomes - // empty - assert!(query_parser.parse_query("the").is_err()); - - // this will have hits - let query = query_parser.parse_query("is")?; + // stop words are applied on the query as well. + // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let mut top_collector = TopCollector::with_limit(10); @@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> { } Ok(()) -} - -use tempdir::TempDir; +} \ No newline at end of file diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f3a9f37c0..93deb48c1 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -20,6 +20,7 @@ use std::str::FromStr; use tokenizer::TokenizerManager; use combine::Parser; use query::EmptyQuery; +use query::query_parser::logical_ast::LogicalAST; /// Possible error that may happen when parsing a query. @@ -58,6 +59,27 @@ impl From for QueryParserError { } } + +/// Recursively remove empty clause from the AST +/// +/// Returns `None` iff the `logical_ast` ended up being empty. 
+fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> { + match logical_ast { + LogicalAST::Clause(children) => { + let trimmed_children = children.into_iter() + .flat_map(|(occur, child)| + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) ) + .collect::<Vec<_>>(); + if trimmed_children.is_empty() { + None + } else { + Some(LogicalAST::Clause(trimmed_children)) + } + }, + _ => Some(logical_ast), + } +} + /// Tantivy's Query parser /// /// The language covered by the current parser is extremely simple. @@ -369,14 +391,15 @@ impl QueryParser { asts.push(LogicalAST::Leaf(Box::new(ast))); } } - let result_ast = if asts.is_empty() { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) - }; + let result_ast: LogicalAST = + if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause( + asts.into_iter() + .map(|ast| (Occur::Should, ast)) + .collect()) + }; Ok(result_ast) } UserInputLeaf::All => { @@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> { } fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> { - match logical_ast { - LogicalAST::Clause(clause) => { - if clause.is_empty() { - Box::new(EmptyQuery) - } else { - let occur_subqueries = clause - .into_iter() - .map(|(occur, subquery)| (occur, convert_to_query(subquery))) - .collect::<Vec<_>>(); - Box::new(BooleanQuery::from(occur_subqueries)) - } - } - LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), + match trim_ast(logical_ast) { + Some(LogicalAST::Clause(trimmed_clause)) => { + let occur_subqueries = trimmed_clause + .into_iter() + .map(|(occur, subquery)| (occur, convert_to_query(subquery))) + .collect::<Vec<_>>(); + assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming"); + Box::new(BooleanQuery::from(occur_subqueries)) + }, + 
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal), + None => Box::new(EmptyQuery) } } @@ -454,12 +475,17 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::SimpleTokenizer; - use tokenizer::TokenizerManager; + use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("en_with_stop_words") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); let title = schema_builder.add_text_field("title", TEXT); let text = schema_builder.add_text_field("text", TEXT); schema_builder.add_i64_field("signed", INT_INDEXED); @@ -468,9 +494,14 @@ mod test { schema_builder.add_text_field("notindexed_u64", STORED); schema_builder.add_text_field("notindexed_i64", STORED); schema_builder.add_text_field("nottokenized", STRING); + schema_builder.add_text_field("with_stop_words", text_options); let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); + tokenizer_manager.register("en_with_stop_words", SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])) + ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -739,6 +770,13 @@ mod test { ); } + #[test] + pub fn test_query_parser_not_empty_but_no_tokens() { + let query_parser = make_query_parser(); + assert!(query_parser.parse_query(" !, ").is_ok()); + assert!(query_parser.parse_query("with_stop_words:the").is_ok()); + } + #[test] pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper(