Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-23.
Expose phrase-prefix queries via the built-in query parser (#2044)
* Expose phrase-prefix queries via the built-in query parser

  This proposes the less-than-imaginative syntax `field:"phrase ter"*` to perform a phrase-prefix query against `field` using `phrase` and `ter` as the terms. The aim of this is to make this type of query more discoverable and to simplify manual testing.

  I did consider exposing the `max_expansions` parameter similar to how slop is handled, but I think that is rather something that should be configured via the query parser (similar to `set_field_boost` and `set_field_fuzzy`), as choosing it requires rather intimate knowledge of the backing index.

* Prevent construction of zero- or one-term phrase-prefix queries via the query parser.

* Add an example using phrase-prefix search via the surface API to improve feature discoverability.
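For context, `max_expansions` can already be set when building the query programmatically; a minimal sketch, assuming the existing `PhrasePrefixQuery::new` and `set_max_expansions` methods (the default cap is 50, per the tests below):

    use tantivy::query::PhrasePrefixQuery;
    use tantivy::schema::{Schema, TEXT};
    use tantivy::Term;

    fn main() {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_text_field("field", TEXT);
        let _schema = schema_builder.build();

        // Hand-built equivalent of the proposed `field:"phrase ter"*` syntax:
        // "phrase" must match exactly, "ter" is matched as a term prefix.
        let mut query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(field, "phrase"),
            Term::from_field_text(field, "ter"),
        ]);
        // Cap how many indexed terms the prefix may expand to.
        query.set_max_expansions(100);
        println!("{query:?}");
    }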
examples/phrase_prefix_search.rs (new file, +79)

@@ -0,0 +1,79 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;

fn main() -> Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let index = Index::create_in_dir(&index_path, schema)?;

    let mut index_writer = index.writer(50_000_000)?;

    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
                 eighty-four days now without taking a fish.",
    ))?;

    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ))?;

    // Multivalued fields just need to be repeated.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ))?;

    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // This will match documents containing the phrase "in the"
    // followed by some word starting with "su",
    // i.e. it will match "in the sunlight" and "in the success",
    // but not "in the Gulf Stream".
    let query = query_parser.parse_query("\"in the su\"*")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut titles = top_docs
        .into_iter()
        .map(|(_score, doc_address)| {
            let doc = searcher.doc(doc_address)?;
            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
            Ok(title)
        })
        .collect::<Result<Vec<_>>>()?;
    titles.sort_unstable();
    assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);

    Ok(())
}
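With this file under `examples/`, it can be run from a checkout with `cargo run --example phrase_prefix_search`.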
@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
 }

 fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
-    (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
-        UserInputLiteral {
+    (field_name(), term_val(), slop_or_prefix_val()).map(
+        |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
             field_name: Some(field_name),
             phrase,
             delimiter,
             slop,
-        }
-    })
+            prefix,
+        },
+    )
+}
+
+fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
+    let prefix_val = char('*').map(|_ast| (0, true));
+    let slop_val = slop_val().map(|slop| (slop, false));
+
+    prefix_val.or(slop_val)
 }

 fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {

 fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
     let term_default_field =
-        (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
-            field_name: None,
-            phrase,
-            delimiter,
-            slop,
+        (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
+            UserInputLiteral {
+                field_name: None,
+                phrase,
+                delimiter,
+                slop,
+                prefix,
+            }
         });

     attempt(term_query())
@@ -872,6 +883,16 @@ mod test {
         test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
     }

+    #[test]
+    fn test_phrase_prefix() {
+        test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
+        test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
+        test_parse_query_to_ast_helper("\"\"*", "\"\"*");
+        test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
+        test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
+        test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
+    }
+
     #[test]
     fn test_not_queries_are_consistent() {
         test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
     pub phrase: String,
     pub delimiter: Delimiter,
     pub slop: u32,
+    pub prefix: bool,
 }

 impl fmt::Debug for UserInputLiteral {
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
         }
         if self.slop > 0 {
             write!(formatter, "~{}", self.slop)?;
+        } else if self.prefix {
+            write!(formatter, "*")?;
         }
         Ok(())
     }
@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
-    /// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
-    /// If the query was only one term long, this returns `None` wherease [`Query::weight`]
-    /// returns a boxed [`RangeWeight`]
     ///
     /// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
     /// built with a single term.
     pub(crate) fn phrase_prefix_query_weight(
         &self,
         enable_scoring: EnableScoring<'_>,
@@ -8,7 +8,11 @@ use crate::Score;
 #[derive(Clone)]
 pub enum LogicalLiteral {
     Term(Term),
-    Phrase(Vec<(usize, Term)>, u32),
+    Phrase {
+        terms: Vec<(usize, Term)>,
+        slop: u32,
+        prefix: bool,
+    },
     Range {
         field: String,
         value_type: Type,
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
         match *self {
             LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
-            LogicalLiteral::Phrase(ref terms, slop) => {
+            LogicalLiteral::Phrase {
+                ref terms,
+                slop,
+                prefix,
+            } => {
                 write!(formatter, "\"{terms:?}\"")?;
                 if slop > 0 {
                     write!(formatter, "~{slop:?}")
+                } else if prefix {
+                    write!(formatter, "*")
                 } else {
                     Ok(())
                 }
@@ -15,21 +15,12 @@ use crate::core::json_utils::{
 use crate::core::Index;
 use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
 use crate::query::{
-    AllQuery,
-    BooleanQuery,
-    BoostQuery,
-    EmptyQuery,
-    FuzzyTermQuery,
-    Occur,
-    PhraseQuery,
-    Query,
-    // RangeQuery,
-    TermQuery,
-    TermSetQuery,
+    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
+    PhraseQuery, Query, TermQuery, TermSetQuery,
 };
 use crate::schema::{
     Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
-    Schema, Term, Type,
+    Schema, Term, TextFieldIndexing, Type,
 };
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::OffsetDateTime;
@@ -79,6 +70,17 @@ pub enum QueryParserError {
     /// have any positions indexed.
     #[error("The field '{0}' does not have positions indexed")]
     FieldDoesNotHavePositionsIndexed(String),
+    /// A phrase-prefix query requires at least two terms
+    #[error(
+        "The phrase '{phrase:?}' does not produce at least two terms using the tokenizer \
+         '{tokenizer:?}'"
+    )]
+    PhrasePrefixRequiresAtLeastTwoTerms {
+        /// The phrase which triggered the issue
+        phrase: String,
+        /// The tokenizer configured for the field
+        tokenizer: String,
+    },
     /// The tokenizer for the given field is unknown
     /// The two argument strings are the name of the field, the name of the tokenizer
     #[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
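A sketch of how a caller might handle the new variant (hypothetical index setup; with the changes below, `parse_query` returns this error when the quoted phrase tokenizes to fewer than two terms):

    use tantivy::query::{QueryParser, QueryParserError};
    use tantivy::schema::{Schema, TEXT};
    use tantivy::Index;

    fn main() {
        let mut schema_builder = Schema::builder();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let parser = QueryParser::for_index(&index, vec![body]);

        // "wo" tokenizes to a single term, so the prefix operator is rejected.
        match parser.parse_query("\"wo\"*") {
            Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { phrase, tokenizer }) => {
                eprintln!("phrase {phrase:?} too short for tokenizer {tokenizer:?}");
            }
            other => panic!("expected a phrase-prefix error, got {other:?}"),
        }
    }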
@@ -194,6 +196,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 ///
 /// Phrase terms support the `~` slop operator which allows to set the phrase's matching
 /// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
+///
+/// Phrase terms also support the `*` prefix operator which switches the phrase's matching
+/// to consider all documents which contain the last term as a prefix, e.g. `"big bad wo"*` will
+/// match `"big bad wolf"`.
 #[derive(Clone)]
 pub struct QueryParser {
     schema: Schema,
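Taken together with the parser changes below, the two operators look like this from the surface API; a minimal sketch (in-RAM index with the default tokenizer; the `body` field is hypothetical):

    use tantivy::query::QueryParser;
    use tantivy::schema::{Schema, TEXT};
    use tantivy::Index;

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let parser = QueryParser::for_index(&index, vec![body]);

        // Slop: up to one extra word between the terms, e.g. "big bad wolf".
        let slop_query = parser.parse_query("\"big wolf\"~1")?;
        // Prefix: the last term is matched as a prefix, e.g. "wolf" or "wolverine".
        let prefix_query = parser.parse_query("\"big bad wo\"*")?;
        println!("{slop_query:?}\n{prefix_query:?}");
        Ok(())
    }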
@@ -446,6 +452,7 @@ impl QueryParser {
         json_path: &str,
         phrase: &str,
         slop: u32,
+        prefix: bool,
     ) -> Result<Vec<LogicalLiteral>, QueryParserError> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
@@ -486,25 +493,25 @@ impl QueryParser {
                 Ok(vec![LogicalLiteral::Term(dt_term)])
             }
             FieldType::Str(ref str_options) => {
-                let option = str_options.get_indexing_options().ok_or_else(|| {
+                let indexing_options = str_options.get_indexing_options().ok_or_else(|| {
                     // This should have been seen earlier really.
                     QueryParserError::FieldNotIndexed(field_name.to_string())
                 })?;
-                let text_analyzer =
-                    self.tokenizer_manager
-                        .get(option.tokenizer())
-                        .ok_or_else(|| QueryParserError::UnknownTokenizer {
-                            field: field_name.to_string(),
-                            tokenizer: option.tokenizer().to_string(),
-                        })?;
-                let index_record_option = option.index_option();
+                let text_analyzer = self
+                    .tokenizer_manager
+                    .get(indexing_options.tokenizer())
+                    .ok_or_else(|| QueryParserError::UnknownTokenizer {
+                        field: field_name.to_string(),
+                        tokenizer: indexing_options.tokenizer().to_string(),
+                    })?;
                 Ok(generate_literals_for_str(
                     field_name,
                     field,
                     phrase,
                     slop,
+                    prefix,
+                    indexing_options,
                     &text_analyzer,
-                    index_record_option,
                 )?
                 .into_iter()
                 .collect())
@@ -661,9 +668,13 @@ impl QueryParser {
                 self.compute_path_triplets_for_literal(&literal)?;
             let mut asts: Vec<LogicalAst> = Vec::new();
             for (field, json_path, phrase) in term_phrases {
-                for ast in
-                    self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
-                {
+                for ast in self.compute_logical_ast_for_leaf(
+                    field,
+                    json_path,
+                    phrase,
+                    literal.slop,
+                    literal.prefix,
+                )? {
                     // Apply some field specific boost defined at the query parser level.
                     let boost = self.field_boost(field);
                     asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
@@ -753,9 +764,17 @@ fn convert_literal_to_query(
                 Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
             }
         }
-        LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
-            PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
-        ),
+        LogicalLiteral::Phrase {
+            terms,
+            slop,
+            prefix,
+        } => {
+            if prefix {
+                Box::new(PhrasePrefixQuery::new_with_offset(terms))
+            } else {
+                Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
+            }
+        }
         LogicalLiteral::Range {
             field,
             value_type,
@@ -774,8 +793,9 @@ fn generate_literals_for_str(
     field: Field,
     phrase: &str,
     slop: u32,
+    prefix: bool,
+    indexing_options: &TextFieldIndexing,
     text_analyzer: &TextAnalyzer,
-    index_record_option: IndexRecordOption,
 ) -> Result<Option<LogicalLiteral>, QueryParserError> {
     let mut terms: Vec<(usize, Term)> = Vec::new();
     let mut token_stream = text_analyzer.token_stream(phrase);
@@ -784,18 +804,28 @@ fn generate_literals_for_str(
         terms.push((token.position, term));
     });
     if terms.len() <= 1 {
+        if prefix {
+            return Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
+                phrase: phrase.to_owned(),
+                tokenizer: indexing_options.tokenizer().to_owned(),
+            });
+        }
         let term_literal_opt = terms
             .into_iter()
             .next()
             .map(|(_, term)| LogicalLiteral::Term(term));
         return Ok(term_literal_opt);
     }
-    if !index_record_option.has_positions() {
+    if !indexing_options.index_option().has_positions() {
         return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
             field_name.to_string(),
         ));
     }
-    Ok(Some(LogicalLiteral::Phrase(terms, slop)))
+    Ok(Some(LogicalLiteral::Phrase {
+        terms,
+        slop,
+        prefix,
+    }))
 }
fn generate_literals_for_json_object(
|
||||
@@ -841,7 +871,11 @@ fn generate_literals_for_json_object(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
|
||||
logical_literals.push(LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
});
|
||||
Ok(logical_literals)
|
||||
}
|
||||
|
||||
@@ -1643,6 +1677,48 @@ mod test {
         );
     }

+    #[test]
+    pub fn test_phrase_prefix() {
+        test_parse_query_to_logical_ast_helper(
+            "\"big bad wo\"*",
+            r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#,
+            false,
+        );
+
+        let query_parser = make_query_parser();
+        let query = query_parser.parse_query("\"big bad wo\"*").unwrap();
+        assert_eq!(
+            format!("{query:?}"),
+            "BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \
+             phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \
+             \"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
+             (Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
+             type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
+             Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
+        );
+    }
+
+    #[test]
+    pub fn test_phrase_prefix_too_short() {
+        let err = parse_query_to_logical_ast("\"wo\"*", true).unwrap_err();
+        assert_eq!(
+            err,
+            QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
+                phrase: "wo".to_owned(),
+                tokenizer: "default".to_owned()
+            }
+        );
+
+        let err = parse_query_to_logical_ast("\"\"*", true).unwrap_err();
+        assert_eq!(
+            err,
+            QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
+                phrase: "".to_owned(),
+                tokenizer: "default".to_owned()
+            }
+        );
+    }
+
     #[test]
     pub fn test_term_set_query() {
         test_parse_query_to_logical_ast_helper(