Slop support for phrase queries (#1241)

Closes #1068
2026-01-09 10:32:55 +00:00 · 2022-03-07 06:29:18 +00:00
parent d31f045872
commit cedced5bb0
4 changed files with 255 additions and 60 deletions
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -181,6 +181,90 @@ pub mod tests {
        Ok(())
    }

+    #[ignore]
+    #[test]
+    pub fn test_phrase_score_with_slop() -> crate::Result<()> {
+        let index = create_index(&["a c b", "a b c a b"])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.set_slop(1);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "b"]);
+        assert_nearly_equals!(scores[0], 0.40618482);
+        assert_nearly_equals!(scores[1], 0.46844664);
+        Ok(())
+    }
+
+    #[test]
+    pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
+        let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.set_slop(3);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "c"]);
+        assert_nearly_equals!(scores[0], 0.29086056);
+        assert_nearly_equals!(scores[1], 0.26706287);
+        Ok(())
+    }
+
+    #[test]
+    pub fn test_phrase_score_with_slop_ordering() -> crate::Result<()> {
+        let index = create_index(&[
+            "a e b e c",
+            "a e e e e e b e e e e c",
+            "a c b",
+            "a c e b e",
+            "a e c b",
+            "a e b c",
+        ])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.set_slop(3);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "b", "c"]);
+        // The first and last matches.
+        assert_nearly_equals!(scores[0], 0.23091172);
+        assert_nearly_equals!(scores[1], 0.25024384);
+        Ok(())
+    }
+
    #[test] // motivated by #234
    pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
--- a/src/query/phrase_query/phrase_query.rs
+++ b/src/query/phrase_query/phrase_query.rs
@@ -23,6 +23,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
 pub struct PhraseQuery {
    field: Field,
    phrase_terms: Vec<(usize, Term)>,
+    slop: u32,
 }

 impl PhraseQuery {
@@ -53,9 +54,15 @@ impl PhraseQuery {
        PhraseQuery {
            field,
            phrase_terms: terms,
+            slop: 0,
        }
    }

+    /// Sets the slop allowed for the phrase.
+    pub fn set_slop(&mut self, value: u32) {
+        self.slop = value;
+    }
+
    /// The `Field` this `PhraseQuery` is targeting.
    pub fn field(&self) -> Field {
        self.field
@@ -94,11 +101,11 @@ impl PhraseQuery {
        }
        let terms = self.phrase_terms();
        let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
-        Ok(PhraseWeight::new(
-            self.phrase_terms.clone(),
-            bm25_weight,
-            scoring_enabled,
-        ))
+        let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
+        if self.slop > 0 {
+            weight.slop(self.slop);
+        }
+        Ok(weight)
    }
 }

--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -52,24 +52,25 @@ pub struct PhraseScorer<TPostings: Postings> {
    fieldnorm_reader: FieldNormReader,
    similarity_weight: Bm25Weight,
    scoring_enabled: bool,
+    slop: u32,
 }

 /// Returns true if and only if the two sorted arrays contain a common element
 fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
-    let mut left_i = 0;
-    let mut right_i = 0;
-    while left_i < left.len() && right_i < right.len() {
-        let left_val = left[left_i];
-        let right_val = right[right_i];
+    let mut left_index = 0;
+    let mut right_index = 0;
+    while left_index < left.len() && right_index < right.len() {
+        let left_val = left[left_index];
+        let right_val = right[right_index];
        match left_val.cmp(&right_val) {
            Ordering::Less => {
-                left_i += 1;
+                left_index += 1;
            }
            Ordering::Equal => {
                return true;
            }
            Ordering::Greater => {
-                right_i += 1;
+                right_index += 1;
            }
        }
    }
@@ -77,23 +78,23 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
 }

 fn intersection_count(left: &[u32], right: &[u32]) -> usize {
-    let mut left_i = 0;
-    let mut right_i = 0;
+    let mut left_index = 0;
+    let mut right_index = 0;
    let mut count = 0;
-    while left_i < left.len() && right_i < right.len() {
-        let left_val = left[left_i];
-        let right_val = right[right_i];
+    while left_index < left.len() && right_index < right.len() {
+        let left_val = left[left_index];
+        let right_val = right[right_index];
        match left_val.cmp(&right_val) {
            Ordering::Less => {
-                left_i += 1;
+                left_index += 1;
            }
            Ordering::Equal => {
                count += 1;
-                left_i += 1;
-                right_i += 1;
+                left_index += 1;
+                right_index += 1;
            }
            Ordering::Greater => {
-                right_i += 1;
+                right_index += 1;
            }
        }
    }
@@ -105,38 +106,91 @@ fn intersection_count(left: &[u32], right: &[u32]) -> usize {
 ///
 /// Returns the length of the intersection
 fn intersection(left: &mut [u32], right: &[u32]) -> usize {
-    let mut left_i = 0;
-    let mut right_i = 0;
+    let mut left_index = 0;
+    let mut right_index = 0;
    let mut count = 0;
    let left_len = left.len();
    let right_len = right.len();
-    while left_i < left_len && right_i < right_len {
-        let left_val = left[left_i];
-        let right_val = right[right_i];
+    while left_index < left_len && right_index < right_len {
+        let left_val = left[left_index];
+        let right_val = right[right_index];
        match left_val.cmp(&right_val) {
            Ordering::Less => {
-                left_i += 1;
+                left_index += 1;
            }
            Ordering::Equal => {
                left[count] = left_val;
                count += 1;
-                left_i += 1;
-                right_i += 1;
+                left_index += 1;
+                right_index += 1;
            }
            Ordering::Greater => {
-                right_i += 1;
+                right_index += 1;
            }
        }
    }
    count
 }

+/// Intersects two sorted arrays `left` and `right` and writes the
+/// resulting array into `left`.
+///
+/// A match requires the value stored in `left` to be less than or equal to the value
+/// in `right`, and the distance between them (i.e. to the previous token) to be at most `slop`.
+///
+/// Returns the length of the intersection
+fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
+    let mut left_index = 0;
+    let mut right_index = 0;
+    let mut count = 0;
+    let left_len = left.len();
+    let right_len = right.len();
+    while left_index < left_len && right_index < right_len {
+        let left_val = left[left_index];
+        let right_val = right[right_index];
+
+        // The three conditions are:
+        // left_val < right_slop -> left index increment.
+        // right_slop <= left_val <= right -> find the best match.
+        // left_val > right -> right index increment.
+        let right_slop = if right_val >= slop {
+            right_val - slop
+        } else {
+            0
+        };
+
+        if left_val < right_slop {
+            left_index += 1;
+        } else if right_slop <= left_val && left_val <= right_val {
+            while left_index + 1 < left_len {
+                // there could be a better match
+                let next_left_val = left[left_index + 1];
+                if next_left_val > right_val {
+                    // the next value is outside the range, so current one is the best.
+                    break;
+                }
+                // the next value is better.
+                left_index += 1;
+            }
+            // store the match in left.
+            left[count] = right_val;
+            count += 1;
+            left_index += 1;
+            right_index += 1;
+        } else if left_val > right_val {
+            right_index += 1;
+        }
+    }
+    count
+}
+
 impl<TPostings: Postings> PhraseScorer<TPostings> {
    pub fn new(
        term_postings: Vec<(usize, TPostings)>,
        similarity_weight: Bm25Weight,
        fieldnorm_reader: FieldNormReader,
        scoring_enabled: bool,
+        slop: u32,
    ) -> PhraseScorer<TPostings> {
        let max_offset = term_postings
            .iter()
@@ -159,6 +213,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
            similarity_weight,
            fieldnorm_reader,
            scoring_enabled,
+            slop,
        };
        if scorer.doc() != TERMINATED && !scorer.phrase_match() {
            scorer.advance();
@@ -181,51 +236,54 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
    }

    fn phrase_exists(&mut self) -> bool {
-        self.intersection_docset
-            .docset_mut_specialized(0)
-            .positions(&mut self.left);
-        let mut intersection_len = self.left.len();
-        for i in 1..self.num_terms - 1 {
-            {
-                self.intersection_docset
-                    .docset_mut_specialized(i)
-                    .positions(&mut self.right);
-            }
-            intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
-            if intersection_len == 0 {
-                return false;
-            }
-        }
-
-        self.intersection_docset
-            .docset_mut_specialized(self.num_terms - 1)
-            .positions(&mut self.right);
+        let intersection_len = self.compute_phrase_match();
        intersection_exists(&self.left[..intersection_len], &self.right[..])
    }

    fn compute_phrase_count(&mut self) -> u32 {
+        let intersection_len = self.compute_phrase_match();
+        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
+    }
+
+    fn compute_phrase_match(&mut self) -> usize {
        {
            self.intersection_docset
                .docset_mut_specialized(0)
                .positions(&mut self.left);
        }
        let mut intersection_len = self.left.len();
-        for i in 1..self.num_terms - 1 {
+        let end_term = if self.has_slop() {
+            self.num_terms
+        } else {
+            self.num_terms - 1
+        };
+        for i in 1..end_term {
            {
                self.intersection_docset
                    .docset_mut_specialized(i)
                    .positions(&mut self.right);
            }
-            intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
+            intersection_len = if self.has_slop() {
+                intersection_with_slop(
+                    &mut self.left[..intersection_len],
+                    &self.right[..],
+                    self.slop,
+                )
+            } else {
+                intersection(&mut self.left[..intersection_len], &self.right[..])
+            };
            if intersection_len == 0 {
-                return 0u32;
+                return 0;
            }
        }
-
        self.intersection_docset
            .docset_mut_specialized(self.num_terms - 1)
            .positions(&mut self.right);
-        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
+        intersection_len
+    }
+
+    fn has_slop(&self) -> bool {
+        self.slop > 0
    }
 }

@@ -268,18 +326,26 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {

 #[cfg(test)]
 mod tests {
-    use super::{intersection, intersection_count};
+    use super::{intersection, intersection_count, intersection_with_slop};

    fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
-        test_intersection_aux(left, right, expected);
-        test_intersection_aux(right, left, expected);
+        test_intersection_aux(left, right, expected, 0);
+        test_intersection_aux(right, left, expected, 0);
    }

-    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
+    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
        let mut left_vec = Vec::from(left);
        let left_mut = &mut left_vec[..];
-        assert_eq!(intersection_count(left_mut, right), expected.len());
-        let count = intersection(left_mut, right);
+        if slop == 0 {
+            let left_mut = &mut left_vec[..];
+            assert_eq!(intersection_count(left_mut, right), expected.len());
+            let count = intersection(left_mut, right);
+            assert_eq!(&left_mut[..count], expected);
+            return;
+        }
+        let mut right_vec = Vec::from(right);
+        let right_mut = &mut right_vec[..];
+        let count = intersection_with_slop(left_mut, right_mut, slop);
        assert_eq!(&left_mut[..count], expected);
    }

@@ -291,6 +357,36 @@ mod tests {
        test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
        test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
    }
+    #[test]
+    fn test_slop() {
+        // The slop is not symmetric. It does not allow for the phrase to be out of order.
+        test_intersection_aux(&[1], &[2], &[2], 1);
+        test_intersection_aux(&[1], &[3], &[], 1);
+        test_intersection_aux(&[1], &[3], &[3], 2);
+        test_intersection_aux(&[], &[2], &[], 100000);
+        test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
+        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
+        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
+        test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[2, 4, 6], 1);
+        test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[], 0);
+    }
+
+    fn test_merge(left: &[u32], right: &[u32], expected_left: &[u32], slop: u32) {
+        let mut left_vec = Vec::from(left);
+        let left_mut = &mut left_vec[..];
+        let mut right_vec = Vec::from(right);
+        let right_mut = &mut right_vec[..];
+        let count = intersection_with_slop(left_mut, right_mut, slop);
+        assert_eq!(&left_mut[..count], expected_left);
+    }
+
+    #[test]
+    fn test_merge_slop() {
+        test_merge(&[1, 2], &[1], &[1], 1);
+        test_merge(&[3], &[4], &[4], 2);
+        test_merge(&[3], &[4], &[4], 2);
+        test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
+    }
 }

 #[cfg(all(test, feature = "unstable"))]
--- a/src/query/phrase_query/phrase_weight.rs
+++ b/src/query/phrase_query/phrase_weight.rs
@@ -12,6 +12,7 @@ pub struct PhraseWeight {
    phrase_terms: Vec<(usize, Term)>,
    similarity_weight: Bm25Weight,
    scoring_enabled: bool,
+    slop: u32,
 }

 impl PhraseWeight {
@@ -21,10 +22,12 @@ impl PhraseWeight {
        similarity_weight: Bm25Weight,
        scoring_enabled: bool,
    ) -> PhraseWeight {
+        let slop = 0;
        PhraseWeight {
            phrase_terms,
            similarity_weight,
            scoring_enabled,
+            slop,
        }
    }

@@ -74,8 +77,13 @@ impl PhraseWeight {
            similarity_weight,
            fieldnorm_reader,
            self.scoring_enabled,
+            self.slop,
        )))
    }
+
+    pub fn slop(&mut self, slop: u32) {
+        self.slop = slop;
+    }
 }

 impl Weight for PhraseWeight {