add PhrasePrefixQuery (#1842)

* add PhrasePrefixQuery
This commit is contained in:
trinity-1686a
2023-02-22 11:18:33 +01:00
committed by GitHub
parent c7278b3258
commit 533ad99cd5
7 changed files with 696 additions and 2 deletions

View File

@@ -12,6 +12,7 @@ mod explanation;
mod fuzzy_query;
mod intersection;
mod more_like_this;
mod phrase_prefix_query;
mod phrase_query;
mod query;
mod query_parser;
@@ -47,6 +48,7 @@ pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::{intersect_scorers, Intersection};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};

View File

@@ -0,0 +1,34 @@
mod phrase_prefix_query;
mod phrase_prefix_scorer;
mod phrase_prefix_weight;
pub use phrase_prefix_query::PhrasePrefixQuery;
pub use phrase_prefix_scorer::PhrasePrefixScorer;
pub use phrase_prefix_weight::PhrasePrefixWeight;
/// Returns the exclusive upper bound of the range of byte strings that have
/// `prefix_start` as a prefix, i.e. the smallest byte string strictly greater
/// than every string starting with `prefix_start`.
///
/// Returns `None` when no such bound exists, which happens iff the prefix
/// consists solely of `0xFF` bytes (this includes the empty prefix).
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
    // The successor is obtained by truncating after the right-most byte that
    // is not already 0xFF, then incrementing that byte.
    let last_incrementable = prefix_start.iter().rposition(|&byte| byte != u8::MAX)?;
    let mut end = prefix_start[..=last_incrementable].to_vec();
    end[last_incrementable] += 1;
    Some(end)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `prefix_end` must return the smallest byte string strictly greater
    /// than every string starting with the prefix, carrying over past 0xFF
    /// bytes, and `None` when the prefix is all 0xFF (no upper bound).
    #[test]
    fn test_prefix_end() {
        assert_eq!(prefix_end(b"aaa"), Some(b"aab".to_vec()));
        assert_eq!(prefix_end(b"aa\xff"), Some(b"ab".to_vec()));
        assert_eq!(prefix_end(b"a\xff\xff"), Some(b"b".to_vec()));
        assert_eq!(prefix_end(b"\xff\xff\xff"), None);
    }
}

View File

@@ -0,0 +1,168 @@
use std::ops::Bound;
use super::{prefix_end, PhrasePrefixWeight};
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
use crate::schema::{Field, IndexRecordOption, Term};
/// Default upper bound on the number of terms the prefix may expand to;
/// can be overridden with [`PhrasePrefixQuery::set_max_expansions`].
const DEFAULT_MAX_EXPANSIONS: u32 = 50;

/// `PhrasePrefixQuery` matches a specific sequence of words followed by term of which only a
/// prefix is known.
///
/// For instance the phrase prefix query for `"part t"` will match
/// the sentence
///
/// **Alan just got a part time job.**
///
/// On the other hand it will not match the sentence.
///
/// **This is my favorite part of the job.**
///
/// Using a `PhrasePrefixQuery` on a field requires positions
/// to be indexed for this field.
#[derive(Clone, Debug)]
pub struct PhrasePrefixQuery {
    // Field targeted by the query; all terms must belong to it.
    field: Field,
    // Fully-known phrase terms, each paired with its position offset.
    phrase_terms: Vec<(usize, Term)>,
    // Last term of the phrase: its bytes are a prefix, not a complete value.
    prefix: (usize, Term),
    // Upper bound on the number of terms the prefix is expanded into.
    max_expansions: u32,
}
impl PhrasePrefixQuery {
    /// Creates a new `PhrasePrefixQuery` given a list of terms.
    ///
    /// There must be at least one term, and all terms
    /// must belong to the same field.
    /// Offset for each term will be same as index in the Vector.
    /// The last Term is a prefix and not a full value.
    pub fn new(terms: Vec<Term>) -> PhrasePrefixQuery {
        let terms_with_offset = terms.into_iter().enumerate().collect();
        PhrasePrefixQuery::new_with_offset(terms_with_offset)
    }

    /// Creates a new `PhrasePrefixQuery` given a list of terms and their offsets.
    ///
    /// Can be used to provide custom offset for each term.
    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhrasePrefixQuery {
        assert!(
            !terms.is_empty(),
            "A phrase prefix query is required to have at least one term."
        );
        // Sort by offset so the term with the greatest offset — the prefix —
        // ends up last and can simply be popped off.
        terms.sort_by_key(|&(offset, _)| offset);
        let field = terms[0].1.field();
        assert!(
            terms[1..].iter().all(|term| term.1.field() == field),
            "All terms from a phrase query must belong to the same field"
        );
        PhrasePrefixQuery {
            field,
            prefix: terms.pop().unwrap(),
            phrase_terms: terms,
            max_expansions: DEFAULT_MAX_EXPANSIONS,
        }
    }

    /// Maximum number of terms to which the last provided term will expand.
    pub fn set_max_expansions(&mut self, value: u32) {
        self.max_expansions = value;
    }

    /// The [`Field`] this `PhrasePrefixQuery` is targeting.
    pub fn field(&self) -> Field {
        self.field
    }

    /// `Term`s in the phrase without the associated offsets.
    ///
    /// Note: the prefix term is NOT included.
    pub fn phrase_terms(&self) -> Vec<Term> {
        // TODO should we include the last term too?
        self.phrase_terms
            .iter()
            .map(|(_, term)| term.clone())
            .collect::<Vec<Term>>()
    }

    /// Returns the [`PhrasePrefixWeight`] for the given phrase query given a specific `searcher`.
    ///
    /// This function is the same as [`Query::weight()`] except it returns
    /// a specialized type [`PhrasePrefixWeight`] instead of a Boxed trait.
    /// If the query was only one term long, this returns `None` whereas [`Query::weight`]
    /// falls back to a boxed range-query weight.
    ///
    /// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
    /// built with a single term.
    pub(crate) fn phrase_prefix_query_weight(
        &self,
        enable_scoring: EnableScoring<'_>,
    ) -> crate::Result<Option<PhrasePrefixWeight>> {
        if self.phrase_terms.is_empty() {
            return Ok(None);
        }
        let schema = enable_scoring.schema();
        let field_entry = schema.get_field_entry(self.field);
        // Phrase matching requires term positions; fail early with a clear
        // error if the field was not indexed with positions.
        let has_positions = field_entry
            .field_type()
            .get_index_record_option()
            .map(IndexRecordOption::has_positions)
            .unwrap_or(false);
        if !has_positions {
            let field_name = field_entry.name();
            return Err(crate::TantivyError::SchemaError(format!(
                "Applied phrase query on field {:?}, which does not have positions indexed",
                field_name
            )));
        }
        let terms = self.phrase_terms();
        // Bm25 statistics are only available (and only needed) when scoring
        // is enabled.
        let bm25_weight_opt = match enable_scoring {
            EnableScoring::Enabled { searcher, .. } => {
                Some(Bm25Weight::for_terms(searcher, &terms)?)
            }
            EnableScoring::Disabled { .. } => None,
        };
        let weight = PhrasePrefixWeight::new(
            self.phrase_terms.clone(),
            self.prefix.clone(),
            bm25_weight_opt,
            self.max_expansions,
        );
        Ok(Some(weight))
    }
}
impl Query for PhrasePrefixQuery {
    /// Create the weight associated with a query.
    ///
    /// See [`Weight`].
    fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
        if let Some(phrase_weight) = self.phrase_prefix_query_weight(enable_scoring)? {
            Ok(Box::new(phrase_weight))
        } else {
            // There is no fully-known phrase term, only the prefix. Fall back
            // to a range query spanning exactly the terms that start with the
            // prefix bytes.
            let end_term = if let Some(end_value) = prefix_end(&self.prefix.1.value_bytes()) {
                let mut end_term = Term::with_capacity(end_value.len());
                end_term.set_field_and_type(self.field, self.prefix.1.typ());
                end_term.append_bytes(&end_value);
                Bound::Excluded(end_term)
            } else {
                // Prefix is all 0xFF bytes: no exclusive upper bound exists.
                Bound::Unbounded
            };
            RangeQuery::new_term_bounds(
                enable_scoring
                    .schema()
                    .get_field_name(self.field)
                    .to_owned(),
                self.prefix.1.typ(),
                &Bound::Included(self.prefix.1.clone()),
                &end_term,
            )
            .weight(enable_scoring)
        }
    }

    fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
        // NOTE(review): only the fully-known phrase terms are reported; the
        // prefix term is never passed to the visitor — confirm this is intended.
        for (_, term) in &self.phrase_terms {
            visitor(term, true);
        }
    }
}

View File

@@ -0,0 +1,207 @@
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
use crate::query::phrase_query::{intersection_count, PhraseScorer};
use crate::query::Scorer;
use crate::{DocId, Score};
/// Scorer over the fully-known part of a phrase prefix query.
///
/// With a single fully-known term there is no phrase to intersect, so the
/// "phrase" collapses to that one term's postings (`SinglePrefix`);
/// otherwise a regular [`PhraseScorer`] is used (`MultiPrefix`).
enum PhraseKind<TPostings: Postings> {
    SinglePrefix {
        // Offset added to this term's positions so they line up with where
        // the prefix term must appear.
        position_offset: u32,
        postings: TPostings,
        // Position cache for the current doc; an empty buffer means
        // "not loaded yet" (see `get_intersection`).
        positions: Vec<u32>,
    },
    MultiPrefix(PhraseScorer<TPostings>),
}
impl<TPostings: Postings> PhraseKind<TPostings> {
    /// Returns the positions (offset applied) at which the phrase part
    /// matches within the current doc.
    fn get_intersection(&mut self) -> &[u32] {
        match self {
            PhraseKind::SinglePrefix {
                position_offset,
                postings,
                positions,
            } => {
                // Empty buffer = positions of the current doc not loaded yet;
                // the buffer is cleared on every advance/seek.
                if positions.is_empty() {
                    postings.positions_with_offset(*position_offset, positions);
                }
                positions
            }
            PhraseKind::MultiPrefix(postings) => postings.get_intersection(),
        }
    }
}
impl<TPostings: Postings> DocSet for PhraseKind<TPostings> {
    fn advance(&mut self) -> DocId {
        match self {
            PhraseKind::SinglePrefix {
                postings,
                positions,
                ..
            } => {
                // Invalidate the position cache of the previous doc; it is
                // lazily refilled by `get_intersection`.
                positions.clear();
                postings.advance()
            }
            PhraseKind::MultiPrefix(postings) => postings.advance(),
        }
    }

    fn doc(&self) -> DocId {
        match self {
            PhraseKind::SinglePrefix { postings, .. } => postings.doc(),
            PhraseKind::MultiPrefix(postings) => postings.doc(),
        }
    }

    fn size_hint(&self) -> u32 {
        match self {
            PhraseKind::SinglePrefix { postings, .. } => postings.size_hint(),
            PhraseKind::MultiPrefix(postings) => postings.size_hint(),
        }
    }

    fn seek(&mut self, target: DocId) -> DocId {
        match self {
            PhraseKind::SinglePrefix {
                postings,
                positions,
                ..
            } => {
                // Same cache invalidation as in `advance`.
                positions.clear();
                postings.seek(target)
            }
            PhraseKind::MultiPrefix(postings) => postings.seek(target),
        }
    }
}
impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
    fn score(&mut self) -> Score {
        match self {
            // A single term carries no phrase frequency: score is a plain
            // match indicator.
            // NOTE(review): an empty buffer is also the "not loaded yet"
            // state, so this yields 0.0 unless `get_intersection` has been
            // called for the current doc — confirm callers guarantee that.
            PhraseKind::SinglePrefix { positions, .. } => {
                if positions.is_empty() {
                    0.0
                } else {
                    1.0
                }
            }
            PhraseKind::MultiPrefix(postings) => postings.score(),
        }
    }
}
/// [`Scorer`] for a phrase prefix query: intersects the positions of the
/// fully-known phrase terms with the positions of every expansion of the
/// prefix.
pub struct PhrasePrefixScorer<TPostings: Postings> {
    // Scorer over the fully-known part of the phrase.
    phrase_scorer: PhraseKind<TPostings>,
    // Postings of every term the prefix expanded to.
    suffixes: Vec<TPostings>,
    // Offset applied to suffix positions so they line up with the phrase.
    suffix_offset: u32,
    // Number of phrase matches within the current doc.
    phrase_count: u32,
}
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
    // If similarity_weight is None, then scoring is disabled.
    /// Builds a scorer from the postings of the fully-known phrase terms
    /// (`term_postings`), the postings of the prefix expansions (`suffixes`)
    /// and the position `suffix_pos` at which the prefix must appear.
    pub fn new(
        mut term_postings: Vec<(usize, TPostings)>,
        similarity_weight_opt: Option<Bm25Weight>,
        fieldnorm_reader: FieldNormReader,
        suffixes: Vec<TPostings>,
        suffix_pos: usize,
    ) -> PhrasePrefixScorer<TPostings> {
        // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
        let max_offset = term_postings
            .iter()
            .map(|(pos, _)| *pos)
            .chain(std::iter::once(suffix_pos))
            .max()
            .unwrap();
        let phrase_scorer = if term_postings.len() > 1 {
            PhraseKind::MultiPrefix(PhraseScorer::new_with_offset(
                term_postings,
                similarity_weight_opt,
                fieldnorm_reader,
                0,
                1,
            ))
        } else {
            // Only one fully-known term: no phrase intersection is needed,
            // track that term's postings directly.
            let (pos, postings) = term_postings
                .pop()
                .expect("PhrasePrefixScorer must have at least two terms");
            let offset = suffix_pos - pos;
            PhraseKind::SinglePrefix {
                position_offset: offset as u32,
                postings,
                positions: Vec::with_capacity(100),
            }
        };
        let mut phrase_prefix_scorer = PhrasePrefixScorer {
            phrase_scorer,
            suffixes,
            suffix_offset: (max_offset - suffix_pos) as u32,
            phrase_count: 0,
        };
        // Position the scorer on the first doc that matches the prefix as
        // well, not just the phrase part.
        if !phrase_prefix_scorer.matches_prefix() {
            phrase_prefix_scorer.advance();
        }
        phrase_prefix_scorer
    }

    /// Number of phrase matches within the current doc.
    pub fn phrase_count(&self) -> u32 {
        self.phrase_count
    }

    /// Checks whether any prefix expansion appears at the expected position
    /// in the current doc, updating `phrase_count` with the total count.
    fn matches_prefix(&mut self) -> bool {
        let mut count = 0;
        // NOTE(review): this buffer is allocated on every call; consider
        // reusing a struct field if this shows up in profiles.
        let mut positions = Vec::new();
        let current_doc = self.doc();
        let pos_matching = self.phrase_scorer.get_intersection();
        for suffix in &mut self.suffixes {
            // A suffix already past the current doc cannot match it.
            if suffix.doc() > current_doc {
                continue;
            }
            let doc = suffix.seek(current_doc);
            if doc == current_doc {
                suffix.positions_with_offset(self.suffix_offset, &mut positions);
                count += intersection_count(pos_matching, &positions);
            }
        }
        self.phrase_count = count as u32;
        count != 0
    }
}
impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
    /// Advances until a doc matching both the phrase part and one of the
    /// prefix expansions is found, or TERMINATED.
    fn advance(&mut self) -> DocId {
        loop {
            let doc = self.phrase_scorer.advance();
            if doc == TERMINATED || self.matches_prefix() {
                return doc;
            }
        }
    }

    /// Seeks to the first matching doc >= `target`.
    fn seek(&mut self, target: DocId) -> DocId {
        // Seek the underlying phrase scorer exactly once. (Seeking twice to
        // the same target, as a previous revision did, is redundant work.)
        let doc = self.phrase_scorer.seek(target);
        if doc == TERMINATED || self.matches_prefix() {
            return doc;
        }
        // The phrase matched but no prefix expansion did: keep advancing.
        self.advance()
    }

    fn doc(&self) -> DocId {
        self.phrase_scorer.doc()
    }

    fn size_hint(&self) -> u32 {
        self.phrase_scorer.size_hint()
    }
}
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
    fn score(&mut self) -> Score {
        // TODO modify score??
        // NOTE(review): the score is delegated to the phrase part only; the
        // frequency of the prefix expansions is not folded in.
        self.phrase_scorer.score()
    }
}

View File

@@ -0,0 +1,260 @@
use super::{prefix_end, PhrasePrefixScorer};
use crate::core::SegmentReader;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::{EmptyScorer, Explanation, Scorer, Weight};
use crate::schema::{IndexRecordOption, Term};
use crate::{DocId, DocSet, Score};
/// [`Weight`] producing [`PhrasePrefixScorer`]s for a phrase prefix query.
pub struct PhrasePrefixWeight {
    // Fully-known phrase terms with their position offsets.
    phrase_terms: Vec<(usize, Term)>,
    // Position offset and term holding the prefix bytes.
    prefix: (usize, Term),
    // `None` when scoring is disabled.
    similarity_weight_opt: Option<Bm25Weight>,
    // Upper bound on the number of terms the prefix is expanded into.
    max_expansions: u32,
}
impl PhrasePrefixWeight {
    /// Creates a new phrase weight.
    /// If `similarity_weight_opt` is None, then scoring is disabled
    pub fn new(
        phrase_terms: Vec<(usize, Term)>,
        prefix: (usize, Term),
        similarity_weight_opt: Option<Bm25Weight>,
        max_expansions: u32,
    ) -> PhrasePrefixWeight {
        PhrasePrefixWeight {
            phrase_terms,
            prefix,
            similarity_weight_opt,
            max_expansions,
        }
    }

    /// Returns the fieldnorm reader used for Bm25 scoring, or a constant
    /// reader when scoring is disabled or the field has no fieldnorms.
    fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
        let field = self.phrase_terms[0].1.field();
        if self.similarity_weight_opt.is_some() {
            if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
                return Ok(fieldnorm_reader);
            }
        }
        Ok(FieldNormReader::constant(reader.max_doc(), 1))
    }

    /// Builds the [`PhrasePrefixScorer`] for this segment.
    ///
    /// Returns `Ok(None)` when one of the phrase terms does not appear in
    /// the segment at all, i.e. no document can match.
    pub(crate) fn phrase_scorer(
        &self,
        reader: &SegmentReader,
        boost: Score,
    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
        let similarity_weight_opt = self
            .similarity_weight_opt
            .as_ref()
            .map(|similarity_weight| similarity_weight.boost_by(boost));
        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
        let mut term_postings_list = Vec::new();
        // Load postings of every fully-known phrase term. The two branches
        // only differ in whether deleted docs must be filtered out.
        if reader.has_deletes() {
            for &(offset, ref term) in &self.phrase_terms {
                if let Some(postings) = reader
                    .inverted_index(term.field())?
                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    term_postings_list.push((offset, postings));
                } else {
                    // Term absent from this segment: nothing can match.
                    return Ok(None);
                }
            }
        } else {
            for &(offset, ref term) in &self.phrase_terms {
                if let Some(postings) = reader
                    .inverted_index(term.field())?
                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    term_postings_list.push((offset, postings));
                } else {
                    return Ok(None);
                }
            }
        }
        // Expand the prefix: walk the term dictionary over the range
        // [prefix, prefix_end) and collect up to `max_expansions` postings.
        let inv_index = reader.inverted_index(self.prefix.1.field())?;
        let mut stream = inv_index.terms().range().ge(self.prefix.1.value_bytes());
        if let Some(end) = prefix_end(self.prefix.1.value_bytes()) {
            stream = stream.lt(&end);
        }
        #[cfg(feature = "quickwit")]
        {
            // We don't have this on the fst, hence we end up needing a feature flag.
            //
            // This is not a problem however as we enforce the limit below too.
            // The point of `stream.limit` is to limit the number of term dictionary
            // blocks being downloaded.
            stream = stream.limit(self.max_expansions as u64);
        }
        let mut stream = stream.into_stream()?;
        let mut suffixes = Vec::with_capacity(self.max_expansions as usize);
        let mut new_term = self.prefix.1.clone();
        while stream.advance() && (suffixes.len() as u32) < self.max_expansions {
            // Reuse the same Term allocation for every expanded key.
            new_term.clear_with_type(new_term.typ());
            new_term.append_bytes(stream.key());
            if reader.has_deletes() {
                if let Some(postings) =
                    inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    suffixes.push(postings);
                }
            } else {
                if let Some(postings) = inv_index
                    .read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    suffixes.push(postings);
                }
            }
        }
        Ok(Some(PhrasePrefixScorer::new(
            term_postings_list,
            similarity_weight_opt,
            fieldnorm_reader,
            suffixes,
            self.prefix.0,
        )))
    }
}
impl Weight for PhrasePrefixWeight {
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
            Ok(Box::new(scorer))
        } else {
            // A phrase term is missing from this segment: no doc can match.
            Ok(Box::new(EmptyScorer))
        }
    }

    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
        if scorer_opt.is_none() {
            return Err(does_not_match(doc));
        }
        let mut scorer = scorer_opt.unwrap();
        // If seeking lands on a different doc, `doc` does not match the query.
        if scorer.seek(doc) != doc {
            return Err(does_not_match(doc));
        }
        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
        let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
        let phrase_count = scorer.phrase_count();
        let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
        // Bm25 details are only available when scoring was enabled.
        if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
            explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
        }
        Ok(explanation)
    }
}
#[cfg(test)]
mod tests {
    use crate::core::Index;
    use crate::docset::TERMINATED;
    use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
    use crate::schema::{Schema, TEXT};
    use crate::{DocSet, Term};

    /// Builds an in-RAM index with one document (field "text") per entry.
    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer = index.writer_for_tests()?;
            for &text in texts {
                let doc = doc!(text_field=>text);
                index_writer.add_document(doc)?;
            }
            index_writer.commit()?;
        }
        Ok(index)
    }

    // Two fully-known terms followed by a prefix: "aa bb c*".
    // Doc 0 never matches, doc 1 matches twice ("bb c", "bb cc"),
    // doc 2 matches once ("bb cd").
    #[test]
    pub fn test_phrase_count_long() -> crate::Result<()> {
        let index = create_index(&[
            "aa bb dd cc",
            "aa aa bb c dd aa bb cc aa bb dc",
            " aa bb cd",
        ])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(text_field, "aa"),
            Term::from_field_text(text_field, "bb"),
            Term::from_field_text(text_field, "c"),
        ]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        let phrase_weight = phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.phrase_count(), 1);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }

    // One fully-known term followed by a prefix: "aa b*"
    // (exercises the SinglePrefix scorer path).
    #[test]
    pub fn test_phrase_count_mid() -> crate::Result<()> {
        let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(text_field, "aa"),
            Term::from_field_text(text_field, "b"),
        ]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        let phrase_weight = phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.phrase_count(), 1);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }

    // Prefix only: "c*". The specialized weight is None and the query
    // falls back to a range query over terms starting with "c".
    #[test]
    pub fn test_phrase_count_short() -> crate::Result<()> {
        let index = create_index(&["aa dd", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![Term::from_field_text(text_field, "c")]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        assert!(phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .is_none());
        let weight = phrase_query.weight(enable_scoring).unwrap();
        let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }
}

View File

@@ -3,6 +3,7 @@ mod phrase_scorer;
mod phrase_weight;
pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_scorer::intersection_count;
pub use self::phrase_scorer::PhraseScorer;
pub use self::phrase_weight::PhraseWeight;

View File

@@ -76,7 +76,7 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
false
}
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize {
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
@@ -250,12 +250,29 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
) -> PhraseScorer<TPostings> {
Self::new_with_offset(
term_postings,
similarity_weight_opt,
fieldnorm_reader,
slop,
0,
)
}
pub(crate) fn new_with_offset(
term_postings: Vec<(usize, TPostings)>,
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
offset: usize,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings
.iter()
.map(|&(offset, _)| offset)
.max()
.unwrap_or(0);
.unwrap_or(0)
+ offset;
let num_docsets = term_postings.len();
let postings_with_offsets = term_postings
.into_iter()
@@ -283,6 +300,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
self.phrase_count
}
/// Returns the positions at which the phrase matches within the current doc.
///
/// NOTE(review): this computes the intersection into `self.left` in place;
/// the returned slice is only valid until the next mutation of the scorer.
pub(crate) fn get_intersection(&mut self) -> &[u32] {
    let len = intersection(&mut self.left, &self.right);
    &self.left[..len]
}
fn phrase_match(&mut self) -> bool {
if self.similarity_weight_opt.is_some() {
let count = self.compute_phrase_count();