From 4c58b0086da5abe4d58abe4f081e0546930c78a2 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Sun, 7 May 2023 11:05:21 +0800 Subject: [PATCH] allow slop in both directions (#2020) * allow slop in both directions allow slop in both directions so "big wolf"~3 can also match "wolf big" This also fixes #1934, when the docsets were reordered by size and didn't match the terms. * remove count * add test for repeating tokens, unduplicate tests --- src/query/phrase_query/mod.rs | 108 ++++------- src/query/phrase_query/phrase_scorer.rs | 229 ++++++++++-------------- 2 files changed, 138 insertions(+), 199 deletions(-) diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 4a0bd8298..1011041da 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -160,22 +160,7 @@ pub mod tests { #[test] pub fn test_phrase_score() -> crate::Result<()> { let index = create_index(&["a b c", "a b c a b"])?; - let schema = index.schema(); - let text_field = schema.get_field("text").unwrap(); - let searcher = index.reader()?.searcher(); - let test_query = |texts: Vec<&str>| { - let terms: Vec<Term> = texts - .iter() - .map(|text| Term::from_field_text(text_field, text)) - .collect(); - let phrase_query = PhraseQuery::new(terms); - searcher - .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE) - .expect("search should succeed") - .scores() - .to_vec() - }; - let scores = test_query(vec!["a", "b"]); + let scores = test_query(0, &index, vec!["a", "b"]); assert_nearly_equals!(scores[0], 0.40618482); assert_nearly_equals!(scores[1], 0.46844664); Ok(()) @@ -185,48 +170,48 @@ pub mod tests { #[test] pub fn test_phrase_score_with_slop() -> crate::Result<()> { let index = create_index(&["a c b", "a b c a b"])?; - let schema = index.schema(); - let text_field = schema.get_field("text").unwrap(); - let searcher = index.reader().unwrap().searcher(); - let test_query = |texts: Vec<&str>| { - let terms: Vec<Term> = texts - .iter() - .map(|text| 
Term::from_field_text(text_field, text)) - .collect(); - let mut phrase_query = PhraseQuery::new(terms); - phrase_query.set_slop(1); - searcher - .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE) - .expect("search should succeed") - .scores() - .to_vec() - }; - let scores = test_query(vec!["a", "b"]); + let scores = test_query(1, &index, vec!["a", "b"]); assert_nearly_equals!(scores[0], 0.40618482); assert_nearly_equals!(scores[1], 0.46844664); Ok(()) } + #[test] + pub fn test_phrase_score_with_slop_bug() -> crate::Result<()> { + let index = create_index(&["asdf asdf Captain Subject Wendy", "Captain"])?; + let scores = test_query(1, &index, vec!["captain", "wendy"]); + assert_eq!(scores.len(), 1); + Ok(()) + } + + fn test_query(slop: u32, index: &Index, texts: Vec<&str>) -> Vec<f32> { + let text_field = index.schema().get_field("text").unwrap(); + let searcher = index.reader().unwrap().searcher(); + let terms: Vec<Term> = texts + .iter() + .map(|text| Term::from_field_text(text_field, text)) + .collect(); + let mut phrase_query = PhraseQuery::new(terms); + phrase_query.set_slop(slop); + searcher + .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE) + .expect("search should succeed") + .scores() + .to_vec() + } + + #[test] + pub fn test_phrase_score_with_slop_repeating() -> crate::Result<()> { + let index = create_index(&["wendy subject subject captain", "Captain"])?; + let scores = test_query(1, &index, vec!["wendy", "subject", "captain"]); + assert_eq!(scores.len(), 1); + Ok(()) + } + #[test] pub fn test_phrase_score_with_slop_size() -> crate::Result<()> { let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?; - let schema = index.schema(); - let text_field = schema.get_field("text").unwrap(); - let searcher = index.reader().unwrap().searcher(); - let test_query = |texts: Vec<&str>| { - let terms: Vec<Term> = texts - .iter() - .map(|text| Term::from_field_text(text_field, text)) - .collect(); - let mut phrase_query = PhraseQuery::new(terms); - 
phrase_query.set_slop(3); - searcher - .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE) - .expect("search should succeed") - .scores() - .to_vec() - }; - let scores = test_query(vec!["a", "c"]); + let scores = test_query(3, &index, vec!["a", "c"]); assert_nearly_equals!(scores[0], 0.29086056); assert_nearly_equals!(scores[1], 0.26706287); Ok(()) @@ -237,31 +222,16 @@ pub mod tests { let index = create_index(&[ "a e b e c", "a e e e e e b e e e e c", - "a c b", + "a c b", // also matches "a c e b e", "a e c b", "a e b c", ])?; - let schema = index.schema(); - let text_field = schema.get_field("text").unwrap(); - let searcher = index.reader().unwrap().searcher(); - let test_query = |texts: Vec<&str>| { - let terms: Vec<Term> = texts - .iter() - .map(|text| Term::from_field_text(text_field, text)) - .collect(); - let mut phrase_query = PhraseQuery::new(terms); - phrase_query.set_slop(3); - searcher - .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE) - .expect("search should succeed") - .scores() - .to_vec() - }; - let scores = test_query(vec!["a", "b", "c"]); + let scores = test_query(3, &index, vec!["a", "b", "c"]); // The first and last matches. 
assert_nearly_equals!(scores[0], 0.23091172); - assert_nearly_equals!(scores[1], 0.25024384); + assert_nearly_equals!(scores[1], 0.27310878); + assert_nearly_equals!(scores[3], 0.25024384); Ok(()) } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 6a5c641ec..86386afc5 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -46,8 +46,8 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> { pub struct PhraseScorer<TPostings: Postings> { intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>, num_terms: usize, - left: Vec<u32>, - right: Vec<u32>, + left_positions: Vec<u32>, + right_positions: Vec<u32>, phrase_count: u32, fieldnorm_reader: FieldNormReader, similarity_weight_opt: Option<Bm25Weight>, @@ -104,7 +104,8 @@ pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize { /// resulting array in left. /// /// Returns the length of the intersection -fn intersection(left: &mut [u32], right: &[u32]) -> usize { +#[inline] +fn intersection(left: &mut Vec<u32>, right: &[u32]) { let mut left_index = 0; let mut right_index = 0; let mut count = 0; @@ -128,42 +129,36 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize { } } } - count + left.truncate(count); } /// Intersect twos sorted arrays `left` and `right` and outputs the -/// resulting array in left. +/// resulting array in left_positions if update_left is true. /// -/// Condition for match is that the value stored in left is less than or equal to -/// the value in right and that the distance to the previous token is lte to the slop. +/// Condition for match is that the distance between left and right is less than or equal to `slop`. 
/// /// Returns the length of the intersection -fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize { +#[inline] +fn intersection_count_with_slop( + left_positions: &mut Vec<u32>, + right_positions: &[u32], + slop: u32, + update_left: bool, +) -> usize { let mut left_index = 0; let mut right_index = 0; let mut count = 0; - let left_len = left.len(); - let right_len = right.len(); + let left_len = left_positions.len(); + let right_len = right_positions.len(); while left_index < left_len && right_index < right_len { - let left_val = left[left_index]; - let right_val = right[right_index]; + let left_val = left_positions[left_index]; + let right_val = right_positions[right_index]; - // The three conditions are: - // left_val < right_slop -> left index increment. - // right_slop <= left_val <= right -> find the best match. - // left_val > right -> right index increment. - let right_slop = if right_val >= slop { - right_val - slop - } else { - 0 - }; - - if left_val < right_slop { - left_index += 1; - } else if right_slop <= left_val && left_val <= right_val { + let distance = left_val.abs_diff(right_val); + if distance <= slop { while left_index + 1 < left_len { // there could be a better match - let next_left_val = left[left_index + 1]; + let next_left_val = left_positions[left_index + 1]; if next_left_val > right_val { // the next value is outside the range, so current one is the best. break; @@ -171,72 +166,45 @@ fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize { // the next value is better. left_index += 1; } + // store the match in left. 
- left[count] = right_val; - count += 1; - left_index += 1; - right_index += 1; - } else if left_val > right_val { - right_index += 1; - } - } - count -} - -fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize { - let mut left_index = 0; - let mut right_index = 0; - let mut count = 0; - let left_len = left.len(); - let right_len = right.len(); - while left_index < left_len && right_index < right_len { - let left_val = left[left_index]; - let right_val = right[right_index]; - let right_slop = if right_val >= slop { - right_val - slop - } else { - 0 - }; - - if left_val < right_slop { - left_index += 1; - } else if right_slop <= left_val && left_val <= right_val { - while left_index + 1 < left_len { - let next_left_val = left[left_index + 1]; - if next_left_val > right_val { - break; - } - left_index += 1; + if update_left { + left_positions[count] = right_val; } count += 1; left_index += 1; right_index += 1; - } else if left_val > right_val { + } else if left_val < right_val { + left_index += 1; + } else { right_index += 1; } } + if update_left { + left_positions.truncate(count); + } + count } -fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool { +fn intersection_exists_with_slop( + left_positions: &[u32], + right_positions: &[u32], + slop: u32, +) -> bool { let mut left_index = 0; let mut right_index = 0; - let left_len = left.len(); - let right_len = right.len(); + let left_len = left_positions.len(); + let right_len = right_positions.len(); while left_index < left_len && right_index < right_len { - let left_val = left[left_index]; - let right_val = right[right_index]; - let right_slop = if right_val >= slop { - right_val - slop - } else { - 0 - }; - - if left_val < right_slop { - left_index += 1; - } else if right_slop <= left_val && left_val <= right_val { + let left_val = left_positions[left_index]; + let right_val = right_positions[right_index]; + let distance = left_val.abs_diff(right_val); + if 
distance <= slop { return true; - } else if left_val > right_val { + } else if left_val < right_val { + left_index += 1; + } else { right_index += 1; } } @@ -261,20 +229,20 @@ impl<TPostings: Postings> PhraseScorer<TPostings> { } pub(crate) fn new_with_offset( - term_postings: Vec<(usize, TPostings)>, + term_postings_with_offset: Vec<(usize, TPostings)>, similarity_weight_opt: Option<Bm25Weight>, fieldnorm_reader: FieldNormReader, slop: u32, offset: usize, ) -> PhraseScorer<TPostings> { - let max_offset = term_postings + let max_offset = term_postings_with_offset .iter() .map(|&(offset, _)| offset) .max() .unwrap_or(0) + offset; - let num_docsets = term_postings.len(); - let postings_with_offsets = term_postings + let num_docsets = term_postings_with_offset.len(); + let postings_with_offsets = term_postings_with_offset .into_iter() .map(|(offset, postings)| { PostingsWithOffset::new(postings, (max_offset - offset) as u32) @@ -283,8 +251,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> { let mut scorer = PhraseScorer { intersection_docset: Intersection::new(postings_with_offsets), num_terms: num_docsets, - left: Vec::with_capacity(100), - right: Vec::with_capacity(100), + left_positions: Vec::with_capacity(100), + right_positions: Vec::with_capacity(100), phrase_count: 0u32, similarity_weight_opt, fieldnorm_reader, @@ -301,8 +269,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> { } pub(crate) fn get_intersection(&mut self) -> &[u32] { - let len = intersection(&mut self.left, &self.right); - &self.left[..len] + intersection(&mut self.left_positions, &self.right_positions); + &self.left_positions } fn phrase_match(&mut self) -> bool { @@ -316,59 +284,61 @@ impl<TPostings: Postings> PhraseScorer<TPostings> { } fn phrase_exists(&mut self) -> bool { - let intersection_len = self.compute_phrase_match(); + self.compute_phrase_match(); if self.has_slop() { - return intersection_exists_with_slop( - &self.left[..intersection_len], - &self.right[..], + intersection_exists_with_slop( + &self.left_positions, + &self.right_positions[..], self.slop, - ); + ) + } else { + intersection_exists(&self.left_positions, 
&self.right_positions[..]) } - intersection_exists(&self.left[..intersection_len], &self.right[..]) } fn compute_phrase_count(&mut self) -> u32 { - let intersection_len = self.compute_phrase_match(); + self.compute_phrase_match(); if self.has_slop() { - return intersection_count_with_slop( - &self.left[..intersection_len], - &self.right[..], + intersection_count_with_slop( + &mut self.left_positions, + &self.right_positions[..], self.slop, - ) as u32; + false, + ) as u32 + } else { + intersection_count(&self.left_positions, &self.right_positions[..]) as u32 } - intersection_count(&self.left[..intersection_len], &self.right[..]) as u32 } - fn compute_phrase_match(&mut self) -> usize { + fn compute_phrase_match(&mut self) { { self.intersection_docset .docset_mut_specialized(0) - .positions(&mut self.left); + .positions(&mut self.left_positions); } - let mut intersection_len = self.left.len(); for i in 1..self.num_terms - 1 { { self.intersection_docset .docset_mut_specialized(i) - .positions(&mut self.right); + .positions(&mut self.right_positions); } - intersection_len = if self.has_slop() { - intersection_with_slop( - &mut self.left[..intersection_len], - &self.right[..], + if self.has_slop() { + intersection_count_with_slop( + &mut self.left_positions, + &self.right_positions[..], self.slop, - ) + true, + ); } else { - intersection(&mut self.left[..intersection_len], &self.right[..]) + intersection(&mut self.left_positions, &self.right_positions); }; - if intersection_len == 0 { - return 0; + if self.left_positions.is_empty() { + return; } } self.intersection_docset .docset_mut_specialized(self.num_terms - 1) - .positions(&mut self.right); - intersection_len + .positions(&mut self.right_positions); } fn has_slop(&self) -> bool { @@ -418,7 +388,7 @@ impl Scorer for PhraseScorer { #[cfg(test)] mod tests { - use super::{intersection, intersection_count, intersection_with_slop}; + use super::{intersection, intersection_count, intersection_count_with_slop}; fn 
test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) { test_intersection_aux(left, right, expected, 0); @@ -427,18 +397,16 @@ mod tests { fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) { let mut left_vec = Vec::from(left); - let left_mut = &mut left_vec[..]; if slop == 0 { - let left_mut = &mut left_vec[..]; - assert_eq!(intersection_count(left_mut, right), expected.len()); - let count = intersection(left_mut, right); - assert_eq!(&left_mut[..count], expected); - return; + assert_eq!(intersection_count(&left_vec, right), expected.len()); + intersection(&mut left_vec, right); + assert_eq!(&left_vec, expected); + } else { + let mut right_vec = Vec::from(right); + let right_mut = &mut right_vec[..]; + intersection_count_with_slop(&mut left_vec, right_mut, slop, true); + assert_eq!(&left_vec, expected); } - let mut right_vec = Vec::from(right); - let right_mut = &mut right_vec[..]; - let count = intersection_with_slop(left_mut, right_mut, slop); - assert_eq!(&left_mut[..count], expected); } #[test] @@ -456,20 +424,19 @@ mod tests { test_intersection_aux(&[1], &[3], &[], 1); test_intersection_aux(&[1], &[3], &[3], 2); test_intersection_aux(&[], &[2], &[], 100000); - test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1); - test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1); - test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10); + test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 10], 1); + test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 1); + test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 10); test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[2, 4, 6], 1); test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[], 0); } fn test_merge(left: &[u32], right: &[u32], expected_left: &[u32], slop: u32) { let mut left_vec = Vec::from(left); - let left_mut = &mut left_vec[..]; let mut right_vec = 
Vec::from(right); let right_mut = &mut right_vec[..]; - let count = intersection_with_slop(left_mut, right_mut, slop); - assert_eq!(&left_mut[..count], expected_left); + intersection_count_with_slop(&mut left_vec, right_mut, slop, true); + assert_eq!(&left_vec, expected_left); } #[test] @@ -477,7 +444,7 @@ mod tests { test_merge(&[1, 2], &[1], &[1], 1); test_merge(&[3], &[4], &[4], 2); test_merge(&[3], &[4], &[4], 2); - test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10); + test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 10); } } @@ -490,8 +457,10 @@ mod bench { #[bench] fn bench_intersection_short(b: &mut Bencher) { + let mut left = Vec::new(); b.iter(|| { - let mut left = [1, 5, 10, 12]; + left.clear(); + left.extend_from_slice(&[1, 5, 10, 12]); let right = [5, 7]; intersection(&mut left, &right); });