mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
Issue/378 (#392)
* Added a failing unit test.
* Closes #378: handle queries that end up empty after going through the analyzer.
* Fixed the stop word example.
This commit is contained in:
@@ -23,7 +23,6 @@ use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// This configures your custom options for how tantivy will
|
||||
@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
|
||||
// to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// This tokenizer lowercases all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
|
||||
@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// this will have NO hits because it was filtered out
|
||||
// because the query is run through the analyzer you
|
||||
// actually will get an error here because the query becomes
|
||||
// empty
|
||||
assert!(query_parser.parse_query("the").is_err());
|
||||
|
||||
// this will have hits
|
||||
let query = query_parser.parse_query("is")?;
|
||||
// stop words are applied on the query as well.
|
||||
// The following will be equivalent to `title:frankenstein`
|
||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use tempdir::TempDir;
|
||||
}
|
||||
@@ -20,6 +20,7 @@ use std::str::FromStr;
|
||||
use tokenizer::TokenizerManager;
|
||||
use combine::Parser;
|
||||
use query::EmptyQuery;
|
||||
use query::query_parser::logical_ast::LogicalAST;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Recursively remove empty clause from the AST
|
||||
///
|
||||
/// Returns `None` iff the `logical_ast` ended up being empty.
|
||||
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(children) => {
|
||||
let trimmed_children = children.into_iter()
|
||||
.flat_map(|(occur, child)|
|
||||
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
|
||||
.collect::<Vec<_>>();
|
||||
if trimmed_children.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(LogicalAST::Clause(trimmed_children))
|
||||
}
|
||||
},
|
||||
_ => Some(logical_ast),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tantivy's Query parser
|
||||
///
|
||||
/// The language covered by the current parser is extremely simple.
|
||||
@@ -369,14 +391,15 @@ impl QueryParser {
|
||||
asts.push(LogicalAST::Leaf(Box::new(ast)));
|
||||
}
|
||||
}
|
||||
let result_ast = if asts.is_empty() {
|
||||
// this should never happen
|
||||
return Err(QueryParserError::SyntaxError);
|
||||
} else if asts.len() == 1 {
|
||||
asts[0].clone()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
|
||||
};
|
||||
let result_ast: LogicalAST =
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(
|
||||
asts.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
Ok(result_ast)
|
||||
}
|
||||
UserInputLeaf::All => {
|
||||
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
}
|
||||
|
||||
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(clause) => {
|
||||
if clause.is_empty() {
|
||||
Box::new(EmptyQuery)
|
||||
} else {
|
||||
let occur_subqueries = clause
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
Box::new(BooleanQuery::from(occur_subqueries))
|
||||
}
|
||||
}
|
||||
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
|
||||
match trim_ast(logical_ast) {
|
||||
Some(LogicalAST::Clause(trimmed_clause)) => {
|
||||
let occur_subqueries = trimmed_clause
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
|
||||
Box::new(BooleanQuery::from(occur_subqueries))
|
||||
},
|
||||
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
|
||||
None => Box::new(EmptyQuery)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,12 +475,17 @@ mod test {
|
||||
use schema::Field;
|
||||
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
|
||||
use Index;
|
||||
|
||||
fn make_query_parser() -> QueryParser {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("en_with_stop_words")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.add_i64_field("signed", INT_INDEXED);
|
||||
@@ -468,9 +494,14 @@ mod test {
|
||||
schema_builder.add_text_field("notindexed_u64", STORED);
|
||||
schema_builder.add_text_field("notindexed_i64", STORED);
|
||||
schema_builder.add_text_field("nottokenized", STRING);
|
||||
schema_builder.add_text_field("with_stop_words", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()]))
|
||||
);
|
||||
QueryParser::new(schema, default_fields, tokenizer_manager)
|
||||
}
|
||||
|
||||
@@ -739,6 +770,13 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
/// Regression test for #378: a query string that is not empty but ends up
/// without any token must still parse successfully.
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
    let query_parser = make_query_parser();
    // Whitespace and punctuation only: expected to yield no token at all.
    assert!(query_parser.parse_query(" !, ").is_ok());
    // `the` is a stop word for the tokenizer registered as
    // `en_with_stop_words`, which the `with_stop_words` field uses.
    assert!(query_parser.parse_query("with_stop_words:the").is_ok());
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
Reference in New Issue
Block a user