Expose phrase-prefix queries via the built-in query parser (#2044)

* Expose phrase-prefix queries via the built-in query parser

This proposes the less-than-imaginative syntax `field:"phrase ter"*` to
perform a phrase-prefix query against `field` using `phrase` and `ter` as the
terms. The aim is to make this type of query more discoverable and to
simplify manual testing.
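
For illustration, a minimal sketch of the resulting usage (assuming an index with a text field `title` registered with the parser):

    use tantivy::query::QueryParser;

    let query_parser = QueryParser::for_index(&index, vec![title]);
    // Matches documents where `title` contains the term "phrase" followed
    // by a term starting with "ter", e.g. "phrase terms" or "phrase territory".
    let query = query_parser.parse_query("title:\"phrase ter\"*")?;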

I did consider exposing the `max_expansions` parameter similarly to how slop is
handled, but I think this is rather something that should be configured via
the query parser (similar to `set_field_boost` and `set_field_fuzzy`), as
choosing it requires rather intimate knowledge of the backing index.
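
For reference, a sketch of the per-field configuration pattern this alludes to; `set_field_boost` and `set_field_fuzzy` are the existing methods named above, while the phrase-prefix analogue in the last comment is purely hypothetical and not part of this change:

    let mut query_parser = QueryParser::for_index(&index, vec![title, body]);
    // Existing per-field knobs are set on the parser rather than in the query string:
    query_parser.set_field_boost(title, 2.0);
    // field, prefix matching, max edit distance, transposition costs one edit
    query_parser.set_field_fuzzy(body, true, 1, true);
    // A hypothetical `set_phrase_prefix_max_expansions(...)` could follow the
    // same shape, since a good value depends on knowledge of the backing index.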

* Prevent construction of zero- or one-term phrase-prefix queries via the query parser.

* Add an example using phrase-prefix search via the surface API to improve feature discoverability.
Author: Adam Reichold
Date: 2023-06-01 13:03:16 +02:00
Committed by: GitHub
Parent: 7ee78bda52
Commit: b325d569ad
6 changed files with 232 additions and 46 deletions


@@ -0,0 +1,79 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;

fn main() -> Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let index = Index::create_in_dir(&index_path, schema)?;

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
                 eighty-four days now without taking a fish.",
    ))?;
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter's flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ))?;
    // Multi-valued fields just need to be repeated.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ))?;
    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // This will match documents containing the phrase "in the"
    // followed by some word starting with "su",
    // i.e. it will match "in the sunlight" and "in the success",
    // but not "in the Gulf Stream".
    let query = query_parser.parse_query("\"in the su\"*")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut titles = top_docs
        .into_iter()
        .map(|(_score, doc_address)| {
            let doc = searcher.doc(doc_address)?;
            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
            Ok(title)
        })
        .collect::<Result<Vec<_>>>()?;
    titles.sort_unstable();
    assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);

    Ok(())
}
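
For comparison, the same query can be built programmatically using `PhrasePrefixQuery` directly (a sketch; it reuses the `body` field from the example above and, unlike the parsed query, targets only that one field):

    use tantivy::query::PhrasePrefixQuery;
    use tantivy::schema::Term;

    // The last term is treated as a prefix, the preceding ones as an exact phrase.
    let query = PhrasePrefixQuery::new(vec![
        Term::from_field_text(body, "in"),
        Term::from_field_text(body, "the"),
        Term::from_field_text(body, "su"),
    ]);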


@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
}
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
    (field_name(), term_val(), slop_or_prefix_val()).map(
        |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
            field_name: Some(field_name),
            phrase,
            delimiter,
            slop,
            prefix,
        },
    )
}

fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
    let prefix_val = char('*').map(|_ast| (0, true));
    let slop_val = slop_val().map(|slop| (slop, false));
    prefix_val.or(slop_val)
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
    let term_default_field =
        (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
            UserInputLiteral {
                field_name: None,
                phrase,
                delimiter,
                slop,
                prefix,
            }
        });
    attempt(term_query())
@@ -872,6 +883,16 @@ mod test {
        test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
    }

    #[test]
    fn test_phrase_prefix() {
        test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
        test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
        test_parse_query_to_ast_helper("\"\"*", "\"\"*");
        test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
        test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
        test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
    }

    #[test]
    fn test_not_queries_are_consistent() {
        test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");


@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
    pub phrase: String,
    pub delimiter: Delimiter,
    pub slop: u32,
    pub prefix: bool,
}
impl fmt::Debug for UserInputLiteral {
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
        }
        if self.slop > 0 {
            write!(formatter, "~{}", self.slop)?;
        } else if self.prefix {
            write!(formatter, "*")?;
        }
        Ok(())
    }


@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
    /// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
    /// If the query was only one term long, this returns `None` whereas [`Query::weight`]
    /// returns a boxed [`RangeWeight`].
    pub(crate) fn phrase_prefix_query_weight(
        &self,
        enable_scoring: EnableScoring<'_>,


@@ -8,7 +8,11 @@ use crate::Score;
#[derive(Clone)]
pub enum LogicalLiteral {
    Term(Term),
    Phrase {
        terms: Vec<(usize, Term)>,
        slop: u32,
        prefix: bool,
    },
    Range {
        field: String,
        value_type: Type,
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        match *self {
            LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
            LogicalLiteral::Phrase {
                ref terms,
                slop,
                prefix,
            } => {
                write!(formatter, "\"{terms:?}\"")?;
                if slop > 0 {
                    write!(formatter, "~{slop:?}")
                } else if prefix {
                    write!(formatter, "*")
                } else {
                    Ok(())
                }


@@ -15,21 +15,12 @@ use crate::core::json_utils::{
use crate::core::Index;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
    PhraseQuery, Query, TermQuery, TermSetQuery,
};
use crate::schema::{
    Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
    Schema, Term, TextFieldIndexing, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -79,6 +70,17 @@ pub enum QueryParserError {
    /// have any positions indexed.
    #[error("The field '{0}' does not have positions indexed")]
    FieldDoesNotHavePositionsIndexed(String),
    /// A phrase-prefix query requires at least two terms
    #[error(
        "The phrase '{phrase:?}' does not produce at least two terms using the tokenizer \
         '{tokenizer:?}'"
    )]
    PhrasePrefixRequiresAtLeastTwoTerms {
        /// The phrase which triggered the issue
        phrase: String,
        /// The tokenizer configured for the field
        tokenizer: String,
    },
    /// The tokenizer for the given field is unknown
    /// The two argument strings are the name of the field and the name of the tokenizer
    #[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
@@ -194,6 +196,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
///
/// Phrase terms support the `~` slop operator which allows setting the phrase's matching
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
///
/// Phrase terms also support the `*` prefix operator which treats the last term of the
/// phrase as a prefix, matching all documents that contain the preceding terms as a phrase
/// followed by a term starting with that prefix, e.g. `"big bad wo"*` will match `"big bad wolf"`.
#[derive(Clone)]
pub struct QueryParser {
    schema: Schema,
@@ -446,6 +452,7 @@ impl QueryParser {
        json_path: &str,
        phrase: &str,
        slop: u32,
        prefix: bool,
    ) -> Result<Vec<LogicalLiteral>, QueryParserError> {
        let field_entry = self.schema.get_field_entry(field);
        let field_type = field_entry.field_type();
@@ -486,25 +493,25 @@ impl QueryParser {
                Ok(vec![LogicalLiteral::Term(dt_term)])
            }
            FieldType::Str(ref str_options) => {
                let indexing_options = str_options.get_indexing_options().ok_or_else(|| {
                    // This should have been seen earlier really.
                    QueryParserError::FieldNotIndexed(field_name.to_string())
                })?;
                let text_analyzer = self
                    .tokenizer_manager
                    .get(indexing_options.tokenizer())
                    .ok_or_else(|| QueryParserError::UnknownTokenizer {
                        field: field_name.to_string(),
                        tokenizer: indexing_options.tokenizer().to_string(),
                    })?;
                Ok(generate_literals_for_str(
                    field_name,
                    field,
                    phrase,
                    slop,
                    prefix,
                    indexing_options,
                    &text_analyzer,
                )?
                .into_iter()
                .collect())
@@ -661,9 +668,13 @@ impl QueryParser {
            self.compute_path_triplets_for_literal(&literal)?;
        let mut asts: Vec<LogicalAst> = Vec::new();
        for (field, json_path, phrase) in term_phrases {
            for ast in self.compute_logical_ast_for_leaf(
                field,
                json_path,
                phrase,
                literal.slop,
                literal.prefix,
            )? {
                // Apply some field specific boost defined at the query parser level.
                let boost = self.field_boost(field);
                asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
@@ -753,9 +764,17 @@ fn convert_literal_to_query(
                Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
            }
        }
        LogicalLiteral::Phrase {
            terms,
            slop,
            prefix,
        } => {
            if prefix {
                Box::new(PhrasePrefixQuery::new_with_offset(terms))
            } else {
                Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
            }
        }
        LogicalLiteral::Range {
            field,
            value_type,
@@ -774,8 +793,9 @@ fn generate_literals_for_str(
    field: Field,
    phrase: &str,
    slop: u32,
    prefix: bool,
    indexing_options: &TextFieldIndexing,
    text_analyzer: &TextAnalyzer,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
    let mut terms: Vec<(usize, Term)> = Vec::new();
    let mut token_stream = text_analyzer.token_stream(phrase);
@@ -784,18 +804,28 @@ fn generate_literals_for_str(
        terms.push((token.position, term));
    });
    if terms.len() <= 1 {
        if prefix {
            return Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
                phrase: phrase.to_owned(),
                tokenizer: indexing_options.tokenizer().to_owned(),
            });
        }
        let term_literal_opt = terms
            .into_iter()
            .next()
            .map(|(_, term)| LogicalLiteral::Term(term));
        return Ok(term_literal_opt);
    }
    if !indexing_options.index_option().has_positions() {
        return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
            field_name.to_string(),
        ));
    }
    Ok(Some(LogicalLiteral::Phrase {
        terms,
        slop,
        prefix,
    }))
}
fn generate_literals_for_json_object(
@@ -841,7 +871,11 @@ fn generate_literals_for_json_object(
            field_name.to_string(),
        ));
    }
    logical_literals.push(LogicalLiteral::Phrase {
        terms,
        slop: 0,
        prefix: false,
    });
    Ok(logical_literals)
}
@@ -1643,6 +1677,48 @@ mod test {
        );
    }

    #[test]
    pub fn test_phrase_prefix() {
        test_parse_query_to_logical_ast_helper(
            "\"big bad wo\"*",
            r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#,
            false,
        );

        let query_parser = make_query_parser();
        let query = query_parser.parse_query("\"big bad wo\"*").unwrap();
        assert_eq!(
            format!("{query:?}"),
            "BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \
             phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \
             \"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
             (Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
             type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
             Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
        );
    }

    #[test]
    pub fn test_phrase_prefix_too_short() {
        let err = parse_query_to_logical_ast("\"wo\"*", true).unwrap_err();
        assert_eq!(
            err,
            QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
                phrase: "wo".to_owned(),
                tokenizer: "default".to_owned()
            }
        );

        let err = parse_query_to_logical_ast("\"\"*", true).unwrap_err();
        assert_eq!(
            err,
            QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
                phrase: "".to_owned(),
                tokenizer: "default".to_owned()
            }
        );
    }

    #[test]
    pub fn test_term_set_query() {
        test_parse_query_to_logical_ast_helper(