Slop support for phrase queries (#1241)

Closes #1068
This commit is contained in:
Halvor Fladsrud Bø
2022-03-07 06:29:18 +00:00
committed by GitHub
parent d31f045872
commit cedced5bb0
4 changed files with 255 additions and 60 deletions

View File

@@ -181,6 +181,90 @@ pub mod tests {
Ok(())
}
#[ignore]
#[test]
pub fn test_phrase_score_with_slop() -> crate::Result<()> {
    // Scores the phrase "a b" with a slop of 1 against a two-document index and
    // checks the first two scores reported by the collector.
    // NOTE(review): the test is marked `#[ignore]` without a stated reason —
    // confirm whether it is still meant to be skipped.
    let index = create_index(&["a c b", "a b c a b"])?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();

    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
    ]);
    query.set_slop(1);

    let scores = searcher
        .search(&query, &TEST_COLLECTOR_WITH_SCORE)
        .expect("search should succeed")
        .scores()
        .to_vec();

    assert_nearly_equals!(scores[0], 0.40618482);
    assert_nearly_equals!(scores[1], 0.46844664);
    Ok(())
}
#[test]
pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
    // Scores the phrase "a c" with a slop of 3 against documents in which the
    // two terms are separated by a growing number of other tokens, and checks
    // the first two scores reported by the collector.
    let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();

    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "c"),
    ]);
    query.set_slop(3);

    let scores = searcher
        .search(&query, &TEST_COLLECTOR_WITH_SCORE)
        .expect("search should succeed")
        .scores()
        .to_vec();

    assert_nearly_equals!(scores[0], 0.29086056);
    assert_nearly_equals!(scores[1], 0.26706287);
    Ok(())
}
#[test]
pub fn test_phrase_score_with_slop_ordering() -> crate::Result<()> {
    // Scores the three-term phrase "a b c" with a slop of 3 against documents
    // in which the terms appear with varying gaps and orderings.
    let documents = [
        "a e b e c",
        "a e e e e e b e e e e c",
        "a c b",
        "a c e b e",
        "a e c b",
        "a e b c",
    ];
    let index = create_index(&documents)?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();

    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
        Term::from_field_text(text_field, "c"),
    ]);
    query.set_slop(3);

    let scores = searcher
        .search(&query, &TEST_COLLECTOR_WITH_SCORE)
        .expect("search should succeed")
        .scores()
        .to_vec();

    // The first and last matches.
    assert_nearly_equals!(scores[0], 0.23091172);
    assert_nearly_equals!(scores[1], 0.25024384);
    Ok(())
}
#[test] // motivated by #234
pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -23,6 +23,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
pub struct PhraseQuery {
// Field targeted by the query (exposed through `field()`).
field: Field,
// Terms of the phrase, each paired with a `usize`.
// NOTE(review): the `usize` is presumably the term's position offset within
// the phrase — confirm against the constructor.
phrase_terms: Vec<(usize, Term)>,
// Slop allowed for the phrase. The constructor initialises it to 0; it is
// changed through `set_slop` and only forwarded to the weight when non-zero.
slop: u32,
}
impl PhraseQuery {
@@ -53,9 +54,15 @@ impl PhraseQuery {
PhraseQuery {
field,
phrase_terms: terms,
slop: 0,
}
}
/// Sets the slop allowed for the phrase.
///
/// The constructor initialises the slop to `0`, and the weight built from this
/// query only receives the slop when it is greater than `0`.
///
/// NOTE(review): judging by the scorer's slop tests, the slop is a tolerance on
/// the distance between consecutive phrase terms and does not allow the terms
/// to appear out of order — confirm before relying on this wording.
pub fn set_slop(&mut self, value: u32) {
self.slop = value;
}
/// The `Field` this `PhraseQuery` is targeting.
pub fn field(&self) -> Field {
self.field
@@ -94,11 +101,11 @@ impl PhraseQuery {
}
let terms = self.phrase_terms();
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
Ok(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,
scoring_enabled,
))
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
if self.slop > 0 {
weight.slop(self.slop);
}
Ok(weight)
}
}

View File

@@ -52,24 +52,25 @@ pub struct PhraseScorer<TPostings: Postings> {
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
/// Returns true if and only if the two sorted arrays contain a common element
fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
let mut left_i = 0;
let mut right_i = 0;
while left_i < left.len() && right_i < right.len() {
let left_val = left[left_i];
let right_val = right[right_i];
let mut left_index = 0;
let mut right_index = 0;
while left_index < left.len() && right_index < right.len() {
let left_val = left[left_index];
let right_val = right[right_index];
match left_val.cmp(&right_val) {
Ordering::Less => {
left_i += 1;
left_index += 1;
}
Ordering::Equal => {
return true;
}
Ordering::Greater => {
right_i += 1;
right_index += 1;
}
}
}
@@ -77,23 +78,23 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
}
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
let mut left_i = 0;
let mut right_i = 0;
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
while left_i < left.len() && right_i < right.len() {
let left_val = left[left_i];
let right_val = right[right_i];
while left_index < left.len() && right_index < right.len() {
let left_val = left[left_index];
let right_val = right[right_index];
match left_val.cmp(&right_val) {
Ordering::Less => {
left_i += 1;
left_index += 1;
}
Ordering::Equal => {
count += 1;
left_i += 1;
right_i += 1;
left_index += 1;
right_index += 1;
}
Ordering::Greater => {
right_i += 1;
right_index += 1;
}
}
}
@@ -105,38 +106,91 @@ fn intersection_count(left: &[u32], right: &[u32]) -> usize {
///
/// Returns the length of the intersection
fn intersection(left: &mut [u32], right: &[u32]) -> usize {
    // Classic two-cursor merge over two sorted slices. Common values are
    // compacted at the front of `left`; `count` is both the number of values
    // kept so far and the next write position.
    let mut left_index = 0;
    let mut right_index = 0;
    let mut count = 0;
    let left_len = left.len();
    let right_len = right.len();
    while left_index < left_len && right_index < right_len {
        let left_val = left[left_index];
        let right_val = right[right_index];
        match left_val.cmp(&right_val) {
            Ordering::Less => {
                left_index += 1;
            }
            Ordering::Equal => {
                // `count <= left_index`, so this never overwrites a value
                // that has not been read yet.
                left[count] = left_val;
                count += 1;
                left_index += 1;
                right_index += 1;
            }
            Ordering::Greater => {
                right_index += 1;
            }
        }
    }
    count
}
/// Intersects two sorted arrays `left` and `right`, tolerating a gap of up to
/// `slop` between them, and writes the result at the beginning of `left`.
///
/// A value of `left` matches a value of `right` when
/// `right - slop <= left <= right` (the lower bound saturates at 0). The match
/// is therefore not symmetric: a `left` value greater than the `right` value
/// never matches.
///
/// When several consecutive `left` values are not greater than the current
/// `right` value, all of them are consumed by that single match. For each
/// match it is the `right` value that is stored in `left`.
///
/// Returns the number of matches written to `left`.
fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
    let mut left_index = 0;
    let mut right_index = 0;
    let mut count = 0;
    let left_len = left.len();
    let right_len = right.len();
    while left_index < left_len && right_index < right_len {
        let left_val = left[left_index];
        let right_val = right[right_index];
        // Smallest left value still within `slop` of `right_val`.
        let window_start = right_val.saturating_sub(slop);
        if left_val < window_start {
            // Too far behind the right value: try the next left value.
            left_index += 1;
        } else if left_val > right_val {
            // Past the right value: try the next right value.
            right_index += 1;
        } else {
            // window_start <= left_val <= right_val: we have a match.
            // Skip forward to the last left value that is still <= right_val,
            // i.e. the closest one, so that it is consumed by this match.
            while left_index + 1 < left_len && left[left_index + 1] <= right_val {
                left_index += 1;
            }
            // `count <= left_index`, so no unread left value is overwritten.
            left[count] = right_val;
            count += 1;
            left_index += 1;
            right_index += 1;
        }
    }
    count
}
impl<TPostings: Postings> PhraseScorer<TPostings> {
pub fn new(
term_postings: Vec<(usize, TPostings)>,
similarity_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
scoring_enabled: bool,
slop: u32,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings
.iter()
@@ -159,6 +213,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
similarity_weight,
fieldnorm_reader,
scoring_enabled,
slop,
};
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
scorer.advance();
@@ -181,51 +236,54 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
fn phrase_exists(&mut self) -> bool {
    // `compute_phrase_match` leaves the surviving candidate positions in
    // `self.left[..intersection_len]` and the positions of the last term in
    // `self.right`; the phrase exists if the two share at least one value.
    // When nothing survived, the left slice is empty and the check is false.
    let intersection_len = self.compute_phrase_match();
    intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn compute_phrase_count(&mut self) -> u32 {
    // Number of values shared by the surviving candidate positions (front of
    // `self.left`) and the positions currently held in `self.right`.
    let survivors = self.compute_phrase_match();
    let candidates = &self.left[..survivors];
    intersection_count(candidates, &self.right[..]) as u32
}
fn compute_phrase_match(&mut self) -> usize {
{
self.intersection_docset
.docset_mut_specialized(0)
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_terms - 1 {
let end_term = if self.has_slop() {
self.num_terms
} else {
self.num_terms - 1
};
for i in 1..end_term {
{
self.intersection_docset
.docset_mut_specialized(i)
.positions(&mut self.right);
}
intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
intersection_len = if self.has_slop() {
intersection_with_slop(
&mut self.left[..intersection_len],
&self.right[..],
self.slop,
)
} else {
intersection(&mut self.left[..intersection_len], &self.right[..])
};
if intersection_len == 0 {
return 0u32;
return 0;
}
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
intersection_len
}
fn has_slop(&self) -> bool {
    // A slop of zero means the slop-aware intersection is not used.
    self.slop != 0
}
}
@@ -268,18 +326,26 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)]
mod tests {
use super::{intersection, intersection_count};
use super::{intersection, intersection_count, intersection_with_slop};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
    // The exact (slop = 0) intersection is symmetric, so check both argument
    // orders against the same expectation.
    test_intersection_aux(left, right, expected, 0);
    test_intersection_aux(right, left, expected, 0);
}
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
let mut left_vec = Vec::from(left);
let left_mut = &mut left_vec[..];
assert_eq!(intersection_count(left_mut, right), expected.len());
let count = intersection(left_mut, right);
if slop == 0 {
let left_mut = &mut left_vec[..];
assert_eq!(intersection_count(left_mut, right), expected.len());
let count = intersection(left_mut, right);
assert_eq!(&left_mut[..count], expected);
return;
}
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
let count = intersection_with_slop(left_mut, right_mut, slop);
assert_eq!(&left_mut[..count], expected);
}
@@ -291,6 +357,36 @@ mod tests {
test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
}
#[test]
fn test_slop() {
    // The slop is not symmetric. It does not allow for the phrase to be out of order.
    // Each case is (left, right, expected, slop).
    let cases: &[(&[u32], &[u32], &[u32], u32)] = &[
        (&[1], &[2], &[2], 1),
        (&[1], &[3], &[], 1),
        (&[1], &[3], &[3], 2),
        (&[], &[2], &[], 100000),
        (&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1),
        (&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1),
        (&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10),
        (&[1, 3, 5], &[2, 4, 6], &[2, 4, 6], 1),
        (&[1, 3, 5], &[2, 4, 6], &[], 0),
    ];
    for &(left, right, expected, slop) in cases {
        test_intersection_aux(left, right, expected, slop);
    }
}
fn test_merge(left: &[u32], right: &[u32], expected_left: &[u32], slop: u32) {
    // Run the slop-aware intersection on owned copies and compare the prefix
    // of `left` that it reports as filled with the expectation.
    let mut merged = left.to_vec();
    let right_copy = right.to_vec();
    let matched = intersection_with_slop(&mut merged, &right_copy, slop);
    assert_eq!(&merged[..matched], expected_left);
}
#[test]
fn test_merge_slop() {
    // On a match, the value written to `left` is the one coming from `right`.
    test_merge(&[1, 2], &[1], &[1], 1);
    test_merge(&[3], &[4], &[4], 2);
    test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
}
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -12,6 +12,7 @@ pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
impl PhraseWeight {
@@ -21,10 +22,12 @@ impl PhraseWeight {
similarity_weight: Bm25Weight,
scoring_enabled: bool,
) -> PhraseWeight {
let slop = 0;
PhraseWeight {
phrase_terms,
similarity_weight,
scoring_enabled,
slop,
}
}
@@ -74,8 +77,13 @@ impl PhraseWeight {
similarity_weight,
fieldnorm_reader,
self.scoring_enabled,
self.slop,
)))
}
/// Sets the slop of this weight.
///
/// Despite the getter-like name, this is a setter: it overwrites the stored
/// value (the constructor initialises it to `0`). The stored slop is handed
/// over as the last argument when the scorer is built.
pub fn slop(&mut self, slop: u32) {
self.slop = slop;
}
}
impl Weight for PhraseWeight {