mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 08:12:54 +00:00
@@ -12,6 +12,7 @@ mod explanation;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod more_like_this;
|
||||
mod phrase_prefix_query;
|
||||
mod phrase_query;
|
||||
mod query;
|
||||
mod query_parser;
|
||||
@@ -47,6 +48,7 @@ pub(crate) use self::fuzzy_query::DfaWrapper;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::{intersect_scorers, Intersection};
|
||||
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
||||
pub use self::phrase_prefix_query::PhrasePrefixQuery;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::{EnableScoring, Query, QueryClone};
|
||||
pub use self::query_parser::{QueryParser, QueryParserError};
|
||||
|
||||
34
src/query/phrase_prefix_query/mod.rs
Normal file
34
src/query/phrase_prefix_query/mod.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
mod phrase_prefix_query;
|
||||
mod phrase_prefix_scorer;
|
||||
mod phrase_prefix_weight;
|
||||
|
||||
pub use phrase_prefix_query::PhrasePrefixQuery;
|
||||
pub use phrase_prefix_scorer::PhrasePrefixScorer;
|
||||
pub use phrase_prefix_weight::PhrasePrefixWeight;
|
||||
|
||||
/// Returns the smallest byte string that is strictly greater than every
/// string starting with `prefix_start`, i.e. the exclusive upper bound of
/// the prefix range. Returns `None` when no such bound exists (the prefix
/// is empty or consists solely of `0xFF` bytes).
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
    let mut bound = prefix_start.to_owned();
    // Trailing 0xFF bytes cannot be incremented: strip them first.
    while bound.last() == Some(&u8::MAX) {
        bound.pop();
    }
    // Bump the last remaining byte; if nothing is left, there is no bound.
    let last = bound.last_mut()?;
    *last += 1;
    Some(bound)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Covers the three interesting shapes for `prefix_end`: no trailing
    // 0xFF byte, some trailing 0xFF bytes, and all-0xFF (no upper bound).
    #[test]
    fn test_prefix_end() {
        assert_eq!(prefix_end(b"aaa"), Some(b"aab".to_vec()));
        assert_eq!(prefix_end(b"aa\xff"), Some(b"ab".to_vec()));
        assert_eq!(prefix_end(b"a\xff\xff"), Some(b"b".to_vec()));
        assert_eq!(prefix_end(b"\xff\xff\xff"), None);
    }
}
|
||||
168
src/query/phrase_prefix_query/phrase_prefix_query.rs
Normal file
168
src/query/phrase_prefix_query/phrase_prefix_query.rs
Normal file
@@ -0,0 +1,168 @@
|
||||
use std::ops::Bound;
|
||||
|
||||
use super::{prefix_end, PhrasePrefixWeight};
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption, Term};
|
||||
|
||||
const DEFAULT_MAX_EXPANSIONS: u32 = 50;
|
||||
|
||||
/// `PhrasePrefixQuery` matches a specific sequence of words followed by term of which only a
/// prefix is known.
///
/// For instance the phrase prefix query for `"part t"` will match
/// the sentence
///
/// **Alan just got a part time job.**
///
/// On the other hand it will not match the sentence.
///
/// **This is my favorite part of the job.**
///
/// Using a `PhrasePrefixQuery` on a field requires positions
/// to be indexed for this field.
#[derive(Clone, Debug)]
pub struct PhrasePrefixQuery {
    /// Field all terms of the phrase belong to.
    field: Field,
    /// Fully-known leading terms, each paired with its position (offset) in the phrase.
    phrase_terms: Vec<(usize, Term)>,
    /// Last term of the phrase, of which only a prefix is known, with its offset.
    prefix: (usize, Term),
    /// Maximum number of indexed terms the prefix may expand to.
    max_expansions: u32,
}
|
||||
|
||||
impl PhrasePrefixQuery {
    /// Creates a new `PhrasePrefixQuery` given a list of terms.
    ///
    /// There must be at least one term, and all terms
    /// must belong to the same field.
    /// Offset for each term will be same as index in the Vector.
    /// The last Term is a prefix and not a full value.
    pub fn new(terms: Vec<Term>) -> PhrasePrefixQuery {
        let terms_with_offset = terms.into_iter().enumerate().collect();
        PhrasePrefixQuery::new_with_offset(terms_with_offset)
    }

    /// Creates a new `PhrasePrefixQuery` given a list of terms and their offsets.
    ///
    /// Can be used to provide custom offset for each term.
    /// The term with the highest offset is treated as the prefix.
    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhrasePrefixQuery {
        assert!(
            !terms.is_empty(),
            "A phrase prefix query is required to have at least one term."
        );
        // Sort by offset so `pop()` below yields the last (prefix) term.
        terms.sort_by_key(|&(offset, _)| offset);
        let field = terms[0].1.field();
        assert!(
            terms[1..].iter().all(|term| term.1.field() == field),
            "All terms from a phrase query must belong to the same field"
        );
        PhrasePrefixQuery {
            field,
            prefix: terms.pop().unwrap(),
            phrase_terms: terms,
            max_expansions: DEFAULT_MAX_EXPANSIONS,
        }
    }

    /// Maximum number of terms to which the last provided term will expand.
    pub fn set_max_expansions(&mut self, value: u32) {
        self.max_expansions = value;
    }

    /// The [`Field`] this `PhrasePrefixQuery` is targeting.
    pub fn field(&self) -> Field {
        self.field
    }

    /// `Term`s in the phrase without the associated offsets.
    /// The trailing prefix term is NOT included.
    pub fn phrase_terms(&self) -> Vec<Term> {
        // TODO should we include the last term too?
        self.phrase_terms
            .iter()
            .map(|(_, term)| term.clone())
            .collect::<Vec<Term>>()
    }

    /// Returns the [`PhrasePrefixWeight`] for the given phrase query given a specific `searcher`.
    ///
    /// This function is the same as [`Query::weight()`] except it returns
    /// a specialized type [`PhrasePrefixWeight`] instead of a Boxed trait.
    /// If the query was only one term long, this returns `None` whereas [`Query::weight`]
    /// returns a boxed `RangeWeight`.
    ///
    /// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
    /// built with a single term.
    pub(crate) fn phrase_prefix_query_weight(
        &self,
        enable_scoring: EnableScoring<'_>,
    ) -> crate::Result<Option<PhrasePrefixWeight>> {
        if self.phrase_terms.is_empty() {
            return Ok(None);
        }
        let schema = enable_scoring.schema();
        let field_entry = schema.get_field_entry(self.field);
        // Phrase matching needs term positions; fail early with a clear
        // schema error otherwise.
        let has_positions = field_entry
            .field_type()
            .get_index_record_option()
            .map(IndexRecordOption::has_positions)
            .unwrap_or(false);
        if !has_positions {
            let field_name = field_entry.name();
            return Err(crate::TantivyError::SchemaError(format!(
                "Applied phrase query on field {:?}, which does not have positions indexed",
                field_name
            )));
        }
        let terms = self.phrase_terms();
        // NOTE(review): the Bm25 weight is computed from the fully-known phrase
        // terms only; the prefix expansions are not part of the statistics.
        let bm25_weight_opt = match enable_scoring {
            EnableScoring::Enabled { searcher, .. } => {
                Some(Bm25Weight::for_terms(searcher, &terms)?)
            }
            EnableScoring::Disabled { .. } => None,
        };
        let weight = PhrasePrefixWeight::new(
            self.phrase_terms.clone(),
            self.prefix.clone(),
            bm25_weight_opt,
            self.max_expansions,
        );
        Ok(Some(weight))
    }
}
|
||||
|
||||
impl Query for PhrasePrefixQuery {
|
||||
/// Create the weight associated with a query.
|
||||
///
|
||||
/// See [`Weight`].
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
if let Some(phrase_weight) = self.phrase_prefix_query_weight(enable_scoring)? {
|
||||
Ok(Box::new(phrase_weight))
|
||||
} else {
|
||||
// There are no prefix. Let's just match the suffix.
|
||||
let end_term = if let Some(end_value) = prefix_end(&self.prefix.1.value_bytes()) {
|
||||
let mut end_term = Term::with_capacity(end_value.len());
|
||||
end_term.set_field_and_type(self.field, self.prefix.1.typ());
|
||||
end_term.append_bytes(&end_value);
|
||||
Bound::Excluded(end_term)
|
||||
} else {
|
||||
Bound::Unbounded
|
||||
};
|
||||
|
||||
RangeQuery::new_term_bounds(
|
||||
enable_scoring
|
||||
.schema()
|
||||
.get_field_name(self.field)
|
||||
.to_owned(),
|
||||
self.prefix.1.typ(),
|
||||
&Bound::Included(self.prefix.1.clone()),
|
||||
&end_term,
|
||||
)
|
||||
.weight(enable_scoring)
|
||||
}
|
||||
}
|
||||
|
||||
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
|
||||
for (_, term) in &self.phrase_terms {
|
||||
visitor(term, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
207
src/query/phrase_prefix_query/phrase_prefix_scorer.rs
Normal file
207
src/query/phrase_prefix_query/phrase_prefix_scorer.rs
Normal file
@@ -0,0 +1,207 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::phrase_query::{intersection_count, PhraseScorer};
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, Score};
|
||||
|
||||
// Scorer over the fully-known part of the phrase; the prefix expansions are
// intersected separately by `PhrasePrefixScorer`.
enum PhraseKind<TPostings: Postings> {
    /// Only one fully-known term: plain postings are enough, no phrase
    /// intersection is needed.
    SinglePrefix {
        // Offset added to the term's positions so they line up with the
        // prefix term's expected position.
        position_offset: u32,
        postings: TPostings,
        // Lazily-filled position cache for the current doc;
        // empty means "not loaded yet".
        positions: Vec<u32>,
    },
    /// Several fully-known terms: delegate to a full `PhraseScorer`.
    MultiPrefix(PhraseScorer<TPostings>),
}
|
||||
|
||||
impl<TPostings: Postings> PhraseKind<TPostings> {
    /// Returns the positions (within the current doc) at which the known part
    /// of the phrase matches, shifted so they can be intersected with the
    /// prefix-term positions.
    fn get_intersection(&mut self) -> &[u32] {
        match self {
            PhraseKind::SinglePrefix {
                position_offset,
                postings,
                positions,
            } => {
                // `positions` is cleared by advance/seek; refill it on first
                // access for the current doc.
                if positions.is_empty() {
                    postings.positions_with_offset(*position_offset, positions);
                }
                positions
            }
            PhraseKind::MultiPrefix(postings) => postings.get_intersection(),
        }
    }
}
|
||||
|
||||
impl<TPostings: Postings> DocSet for PhraseKind<TPostings> {
    fn advance(&mut self) -> DocId {
        match self {
            PhraseKind::SinglePrefix {
                postings,
                positions,
                ..
            } => {
                // Invalidate the per-doc position cache before moving on.
                positions.clear();
                postings.advance()
            }
            PhraseKind::MultiPrefix(postings) => postings.advance(),
        }
    }

    fn doc(&self) -> DocId {
        match self {
            PhraseKind::SinglePrefix { postings, .. } => postings.doc(),
            PhraseKind::MultiPrefix(postings) => postings.doc(),
        }
    }

    fn size_hint(&self) -> u32 {
        match self {
            PhraseKind::SinglePrefix { postings, .. } => postings.size_hint(),
            PhraseKind::MultiPrefix(postings) => postings.size_hint(),
        }
    }

    fn seek(&mut self, target: DocId) -> DocId {
        match self {
            PhraseKind::SinglePrefix {
                postings,
                positions,
                ..
            } => {
                // Invalidate the per-doc position cache before moving on.
                positions.clear();
                postings.seek(target)
            }
            PhraseKind::MultiPrefix(postings) => postings.seek(target),
        }
    }
}
|
||||
|
||||
impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
    fn score(&mut self) -> Score {
        match self {
            // A single term carries no phrase score of its own: report 1.0 when
            // the cached positions show a match, 0.0 otherwise.
            // NOTE(review): this relies on `get_intersection` having been called
            // for the current doc (otherwise `positions` is still empty) —
            // confirm against callers.
            PhraseKind::SinglePrefix { positions, .. } => {
                if positions.is_empty() {
                    0.0
                } else {
                    1.0
                }
            }
            PhraseKind::MultiPrefix(postings) => postings.score(),
        }
    }
}
|
||||
|
||||
/// Scorer for a phrase ending in a prefix term: intersects the fully-known
/// phrase part with every postings list the prefix expanded to.
pub struct PhrasePrefixScorer<TPostings: Postings> {
    // Scorer over the fully-known phrase terms.
    phrase_scorer: PhraseKind<TPostings>,
    // Postings of every indexed term matching the prefix (the expansions).
    suffixes: Vec<TPostings>,
    // Offset applied to suffix positions before intersecting with the phrase.
    suffix_offset: u32,
    // Number of phrase matches in the current doc (set by `matches_prefix`).
    phrase_count: u32,
}
|
||||
|
||||
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
    // If similarity_weight is None, then scoring is disabled.
    //
    // `term_postings` are the fully-known phrase terms with their offsets,
    // `suffixes` the postings of every prefix expansion, and `suffix_pos`
    // the offset of the prefix term within the phrase.
    pub fn new(
        mut term_postings: Vec<(usize, TPostings)>,
        similarity_weight_opt: Option<Bm25Weight>,
        fieldnorm_reader: FieldNormReader,
        suffixes: Vec<TPostings>,
        suffix_pos: usize,
    ) -> PhrasePrefixScorer<TPostings> {
        // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
        // (chaining `suffix_pos` guarantees `max_offset >= suffix_pos`, so the
        // subtraction below cannot underflow; `.max()` cannot fail on the
        // non-empty chained iterator).
        let max_offset = term_postings
            .iter()
            .map(|(pos, _)| *pos)
            .chain(std::iter::once(suffix_pos))
            .max()
            .unwrap();

        let phrase_scorer = if term_postings.len() > 1 {
            PhraseKind::MultiPrefix(PhraseScorer::new_with_offset(
                term_postings,
                similarity_weight_opt,
                fieldnorm_reader,
                0,
                1,
            ))
        } else {
            // Exactly one fully-known term: a plain postings cursor suffices.
            // NOTE(review): the expect message says "at least two terms" but the
            // actual invariant enforced here is "at least one" — confirm intent.
            let (pos, postings) = term_postings
                .pop()
                .expect("PhrasePrefixScorer must have at least two terms");
            let offset = suffix_pos - pos;
            PhraseKind::SinglePrefix {
                position_offset: offset as u32,
                postings,
                positions: Vec::with_capacity(100),
            }
        };
        let mut phrase_prefix_scorer = PhrasePrefixScorer {
            phrase_scorer,
            suffixes,
            suffix_offset: (max_offset - suffix_pos) as u32,
            phrase_count: 0,
        };
        // Position the scorer on its first real match (the phrase part may
        // currently sit on a doc where no prefix expansion matches).
        if !phrase_prefix_scorer.matches_prefix() {
            phrase_prefix_scorer.advance();
        }
        phrase_prefix_scorer
    }

    /// Number of phrase matches found in the current document.
    pub fn phrase_count(&self) -> u32 {
        self.phrase_count
    }

    /// Checks whether any prefix expansion matches at the position right after
    /// the phrase in the current doc, and updates `phrase_count` accordingly.
    fn matches_prefix(&mut self) -> bool {
        let mut count = 0;
        let mut positions = Vec::new();
        let current_doc = self.doc();
        let pos_matching = self.phrase_scorer.get_intersection();
        for suffix in &mut self.suffixes {
            // This expansion has already moved past the current doc.
            if suffix.doc() > current_doc {
                continue;
            }
            let doc = suffix.seek(current_doc);
            if doc == current_doc {
                suffix.positions_with_offset(self.suffix_offset, &mut positions);
                count += intersection_count(pos_matching, &positions);
            }
        }
        self.phrase_count = count as u32;
        count != 0
    }
}
|
||||
|
||||
impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
loop {
|
||||
let doc = self.phrase_scorer.advance();
|
||||
if doc == TERMINATED || self.matches_prefix() {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.phrase_scorer.seek(target);
|
||||
let doc = self.phrase_scorer.seek(target);
|
||||
if doc == TERMINATED || self.matches_prefix() {
|
||||
return doc;
|
||||
}
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.phrase_scorer.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.phrase_scorer.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
    fn score(&mut self) -> Score {
        // TODO modify score??
        // NOTE(review): delegates to the inner phrase scorer; the number of
        // prefix matches (`phrase_count`) does not influence the score here.
        self.phrase_scorer.score()
    }
}
|
||||
260
src/query/phrase_prefix_query/phrase_prefix_weight.rs
Normal file
260
src/query/phrase_prefix_query/phrase_prefix_weight.rs
Normal file
@@ -0,0 +1,260 @@
|
||||
use super::{prefix_end, PhrasePrefixScorer};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{EmptyScorer, Explanation, Scorer, Weight};
|
||||
use crate::schema::{IndexRecordOption, Term};
|
||||
use crate::{DocId, DocSet, Score};
|
||||
|
||||
/// Weight for a phrase-prefix query: a sequence of fully-known terms followed
/// by a trailing term of which only a prefix is known.
pub struct PhrasePrefixWeight {
    // Fully-known phrase terms with their offsets.
    phrase_terms: Vec<(usize, Term)>,
    // The trailing prefix term with its offset.
    prefix: (usize, Term),
    // `None` disables scoring.
    similarity_weight_opt: Option<Bm25Weight>,
    // Maximum number of indexed terms the prefix may expand to.
    max_expansions: u32,
}
|
||||
|
||||
impl PhrasePrefixWeight {
    /// Creates a new phrase weight.
    /// If `similarity_weight_opt` is None, then scoring is disabled
    pub fn new(
        phrase_terms: Vec<(usize, Term)>,
        prefix: (usize, Term),
        similarity_weight_opt: Option<Bm25Weight>,
        max_expansions: u32,
    ) -> PhrasePrefixWeight {
        PhrasePrefixWeight {
            phrase_terms,
            prefix,
            similarity_weight_opt,
            max_expansions,
        }
    }

    /// Returns the fieldnorm reader for the phrase's field, falling back to a
    /// constant reader when scoring is disabled or no fieldnorms are stored.
    fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
        let field = self.phrase_terms[0].1.field();
        if self.similarity_weight_opt.is_some() {
            if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
                return Ok(fieldnorm_reader);
            }
        }
        Ok(FieldNormReader::constant(reader.max_doc(), 1))
    }

    /// Builds the scorer for one segment. Returns `Ok(None)` when any of the
    /// fully-known phrase terms is absent from the segment (the phrase can
    /// then match nothing).
    pub(crate) fn phrase_scorer(
        &self,
        reader: &SegmentReader,
        boost: Score,
    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
        let similarity_weight_opt = self
            .similarity_weight_opt
            .as_ref()
            .map(|similarity_weight| similarity_weight.boost_by(boost));
        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
        let mut term_postings_list = Vec::new();
        // Load postings for every fully-known term; the deletes / no-deletes
        // code paths differ only in which read method is used.
        if reader.has_deletes() {
            for &(offset, ref term) in &self.phrase_terms {
                if let Some(postings) = reader
                    .inverted_index(term.field())?
                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    term_postings_list.push((offset, postings));
                } else {
                    // Term absent from the segment: the phrase cannot match.
                    return Ok(None);
                }
            }
        } else {
            for &(offset, ref term) in &self.phrase_terms {
                if let Some(postings) = reader
                    .inverted_index(term.field())?
                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    term_postings_list.push((offset, postings));
                } else {
                    return Ok(None);
                }
            }
        }

        // Walk the term dictionary over the range [prefix, prefix_end) to
        // collect the prefix expansions.
        let inv_index = reader.inverted_index(self.prefix.1.field())?;
        let mut stream = inv_index.terms().range().ge(self.prefix.1.value_bytes());
        if let Some(end) = prefix_end(self.prefix.1.value_bytes()) {
            stream = stream.lt(&end);
        }

        #[cfg(feature = "quickwit")]
        {
            // We don't have this on the fst, hence we end up needing a feature flag.
            //
            // This is not a problem however as we enforce the limit below too.
            // The point of `stream.limit` is to limit the number of term dictionary
            // blocks being downloaded.
            stream = stream.limit(self.max_expansions as u64);
        }

        let mut stream = stream.into_stream()?;

        // `new_term` is reused across iterations to avoid re-allocating a Term
        // per expansion.
        let mut suffixes = Vec::with_capacity(self.max_expansions as usize);
        let mut new_term = self.prefix.1.clone();
        while stream.advance() && (suffixes.len() as u32) < self.max_expansions {
            new_term.clear_with_type(new_term.typ());
            new_term.append_bytes(stream.key());
            if reader.has_deletes() {
                if let Some(postings) =
                    inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    suffixes.push(postings);
                }
            } else {
                if let Some(postings) = inv_index
                    .read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
                {
                    suffixes.push(postings);
                }
            }
        }

        Ok(Some(PhrasePrefixScorer::new(
            term_postings_list,
            similarity_weight_opt,
            fieldnorm_reader,
            suffixes,
            self.prefix.0,
        )))
    }
}
|
||||
|
||||
impl Weight for PhrasePrefixWeight {
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
            Ok(Box::new(scorer))
        } else {
            // A phrase term is missing from this segment: nothing can match.
            Ok(Box::new(EmptyScorer))
        }
    }

    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
        // Rebuild an unboosted scorer and position it on `doc` to recover the
        // phrase count for that document.
        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
        if scorer_opt.is_none() {
            return Err(does_not_match(doc));
        }
        let mut scorer = scorer_opt.unwrap();
        if scorer.seek(doc) != doc {
            return Err(does_not_match(doc));
        }
        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
        let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
        let phrase_count = scorer.phrase_count();
        let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
        if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
            explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
        }
        Ok(explanation)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use crate::core::Index;
    use crate::docset::TERMINATED;
    use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
    use crate::schema::{Schema, TEXT};
    use crate::{DocSet, Term};

    /// Builds an in-RAM index with a single `text` field and one document per
    /// entry of `texts`, committed in a single segment.
    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer = index.writer_for_tests()?;
            for &text in texts {
                let doc = doc!(text_field=>text);
                index_writer.add_document(doc)?;
            }
            index_writer.commit()?;
        }
        Ok(index)
    }

    // Phrase of two known terms + one prefix ("aa bb c*"): checks doc ids and
    // per-doc phrase counts.
    #[test]
    pub fn test_phrase_count_long() -> crate::Result<()> {
        let index = create_index(&[
            "aa bb dd cc",
            "aa aa bb c dd aa bb cc aa bb dc",
            " aa bb cd",
        ])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(text_field, "aa"),
            Term::from_field_text(text_field, "bb"),
            Term::from_field_text(text_field, "c"),
        ]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        let phrase_weight = phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.phrase_count(), 1);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }

    // One known term + one prefix ("aa b*"): exercises the SinglePrefix path.
    #[test]
    pub fn test_phrase_count_mid() -> crate::Result<()> {
        let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![
            Term::from_field_text(text_field, "aa"),
            Term::from_field_text(text_field, "b"),
        ]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        let phrase_weight = phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.phrase_count(), 1);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }

    // Single prefix term ("c*"): the specialized weight is None and the query
    // falls back to a range query through `Query::weight`.
    #[test]
    pub fn test_phrase_count_short() -> crate::Result<()> {
        let index = create_index(&["aa dd", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
        let schema = index.schema();
        let text_field = schema.get_field("text").unwrap();
        let searcher = index.reader()?.searcher();
        let phrase_query = PhrasePrefixQuery::new(vec![Term::from_field_text(text_field, "c")]);
        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
        assert!(phrase_query
            .phrase_prefix_query_weight(enable_scoring)
            .unwrap()
            .is_none());
        let weight = phrase_query.weight(enable_scoring).unwrap();
        let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.advance(), 2);
        assert_eq!(phrase_scorer.doc(), 2);
        assert_eq!(phrase_scorer.advance(), TERMINATED);
        Ok(())
    }
}
|
||||
@@ -3,6 +3,7 @@ mod phrase_scorer;
|
||||
mod phrase_weight;
|
||||
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub(crate) use self::phrase_scorer::intersection_count;
|
||||
pub use self::phrase_scorer::PhraseScorer;
|
||||
pub use self::phrase_weight::PhraseWeight;
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
|
||||
pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize {
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let mut count = 0;
|
||||
@@ -250,12 +250,29 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
slop: u32,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
Self::new_with_offset(
|
||||
term_postings,
|
||||
similarity_weight_opt,
|
||||
fieldnorm_reader,
|
||||
slop,
|
||||
0,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn new_with_offset(
|
||||
term_postings: Vec<(usize, TPostings)>,
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
slop: u32,
|
||||
offset: usize,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
let max_offset = term_postings
|
||||
.iter()
|
||||
.map(|&(offset, _)| offset)
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
.unwrap_or(0)
|
||||
+ offset;
|
||||
let num_docsets = term_postings.len();
|
||||
let postings_with_offsets = term_postings
|
||||
.into_iter()
|
||||
@@ -283,6 +300,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
self.phrase_count
|
||||
}
|
||||
|
||||
pub(crate) fn get_intersection(&mut self) -> &[u32] {
|
||||
let len = intersection(&mut self.left, &self.right);
|
||||
&self.left[..len]
|
||||
}
|
||||
|
||||
fn phrase_match(&mut self) -> bool {
|
||||
if self.similarity_weight_opt.is_some() {
|
||||
let count = self.compute_phrase_count();
|
||||
|
||||
Reference in New Issue
Block a user