Fixed bench

Cleaned up variable names
Made the implementation respect ordering
2025-12-27 20:42:54 +00:00 · 2021-07-07 00:32:01 +01:00 · 2021-07-07 00:25:42 +01:00 · 2021-07-06 23:58:15 +01:00 · 2021-07-06 20:50:41 +01:00 · 2021-07-05 23:30:34 +01:00
4 changed files with 290 additions and 32 deletions
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -179,6 +179,87 @@ pub mod tests {
        assert_nearly_equals!(scores[1], 0.46844664);
    }

+    #[ignore]
+    #[test]
+    pub fn test_phrase_score_with_slop() {
+        let index = create_index(&["a c b", "a b c a b"]);
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.slop(1);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "b"]);
+        assert_nearly_equals!(scores[0], 0.40618482);
+        assert_nearly_equals!(scores[1], 0.46844664);
+    }
+
+    #[test]
+    pub fn test_phrase_score_with_slop_size() {
+        let index = create_index(&["a b e c", "a e e e c", "a e e e e c"]);
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.slop(3);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "c"]);
+        assert_nearly_equals!(scores[0], 0.29086056);
+        assert_nearly_equals!(scores[1], 0.26706287);
+    }
+
+    #[test]
+    pub fn test_phrase_score_with_slop_ordering() {
+        let index = create_index(&[
+            "a e b e c",
+            "a e e e e e b e e e e c",
+            "a c b",
+            "a c e b e",
+            "a e c b",
+            "a e b c",
+        ]);
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let test_query = |texts: Vec<&str>| {
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let mut phrase_query = PhraseQuery::new(terms);
+            phrase_query.slop(3);
+            searcher
+                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
+                .expect("search should succeed")
+                .scores()
+                .to_vec()
+        };
+        let scores = test_query(vec!["a", "b", "c"]);
+        // The first and last matches.
+        assert_nearly_equals!(scores[0], 0.23091172);
+        assert_nearly_equals!(scores[1], 0.25024384);
+    }
+
    #[test] // motivated by #234
    pub fn test_phrase_query_docfreq_order() {
        let mut schema_builder = Schema::builder();
--- a/src/query/phrase_query/phrase_query.rs
+++ b/src/query/phrase_query/phrase_query.rs
@@ -26,6 +26,7 @@ use crate::schema::{Field, Term};
 pub struct PhraseQuery {
    field: Field,
    phrase_terms: Vec<(usize, Term)>,
+    slop: u32,
 }

 impl PhraseQuery {
@@ -56,9 +57,15 @@ impl PhraseQuery {
        PhraseQuery {
            field,
            phrase_terms: terms,
+            slop: 0,
        }
    }

+    /// Slop allowed for the phrase.
+    pub fn slop(&mut self, value: u32) {
+        self.slop = value;
+    }
+
    /// The `Field` this `PhraseQuery` is targeting.
    pub fn field(&self) -> Field {
        self.field
@@ -97,11 +104,11 @@ impl PhraseQuery {
        }
        let terms = self.phrase_terms();
        let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
-        Ok(PhraseWeight::new(
-            self.phrase_terms.clone(),
-            bm25_weight,
-            scoring_enabled,
-        ))
+        let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
+        if self.slop > 0 {
+            weight.slop(self.slop);
+        }
+        Ok(weight)
    }
 }

--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -51,6 +51,7 @@ pub struct PhraseScorer<TPostings: Postings> {
    fieldnorm_reader: FieldNormReader,
    similarity_weight: Bm25Weight,
    scoring_enabled: bool,
+    slop: u32,
 }

 /// Returns true iff the two sorted array contain a common element
@@ -130,12 +131,82 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
    count
 }

+/// Intersect twos sorted arrays `left` and `right` and outputs the
+/// resulting array in left.
+///
+/// Condition for match is that the value stored in left is less than the value in right,
+/// and that the difference between the value in left_begin is within
+///
+/// Returns the length of the intersection
+fn intersection_with_distance(
+    left: &mut [u32],
+    left_begin: &mut [u32],
+    right: &mut [u32],
+    max_distance_to_begin: u32,
+) -> usize {
+    // TODO: Improve variable names?
+    let mut left_i = 0;
+    let mut right_i = 0;
+    let mut count = 0;
+    let left_len = left.len();
+    let right_len = right.len();
+    // Is the current last value guaranteed to be the final value.
+    let mut is_temporary = false;
+    while left_i < left_len && right_i < right_len {
+        let left_val = left[left_i];
+        let right_val = right[right_i];
+        match left_val.cmp(&right_val) {
+            Ordering::Less => {
+                if right_val - left_begin[left_i] <= max_distance_to_begin {
+                    if is_temporary {
+                        // If the value was temporary we have found a closer match.
+                        count -= 1;
+                    };
+                    left[count] = left_val;
+                    left_begin[count] = left_begin[left_i];
+                    right[count] = right_val;
+                    count += 1;
+                    left_i += 1;
+                    // Still possible to find a closer match.
+                    is_temporary = true;
+                } else {
+                    left_i += 1;
+                }
+            }
+            Ordering::Equal => {
+                if is_temporary {
+                    // If the value was temporary we have found an.
+                    count -= 1;
+                    is_temporary = false;
+                }
+                left[count] = left_val;
+                left_begin[count] = left_begin[left_i];
+                right[count] = right_val;
+                count += 1;
+                left_i += 1;
+                right_i += 1;
+            }
+            Ordering::Greater => {
+                right_i += 1;
+                // Given the constraint that left cannot be greater than right we know that the value in left is
+                // final.
+                is_temporary = false;
+            }
+        }
+    }
+    for i in 0..count {
+        left[i] = right[i];
+    }
+    count
+}
+
 impl<TPostings: Postings> PhraseScorer<TPostings> {
    pub fn new(
        term_postings: Vec<(usize, TPostings)>,
        similarity_weight: Bm25Weight,
        fieldnorm_reader: FieldNormReader,
        scoring_enabled: bool,
+        slop: u32,
    ) -> PhraseScorer<TPostings> {
        let max_offset = term_postings
            .iter()
@@ -158,6 +229,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
            similarity_weight,
            fieldnorm_reader,
            scoring_enabled,
+            slop,
        };
        if scorer.doc() != TERMINATED && !scorer.phrase_match() {
            scorer.advance();
@@ -170,6 +242,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
    }

    fn phrase_match(&mut self) -> bool {
+        // Need to add support for slop in phrase_count and phrase_exists.
        if self.scoring_enabled {
            let count = self.compute_phrase_count();
            self.phrase_count = count;
@@ -180,29 +253,25 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
    }

    fn phrase_exists(&mut self) -> bool {
-        self.intersection_docset
-            .docset_mut_specialized(0)
-            .positions(&mut self.left);
-        let mut intersection_len = self.left.len();
-        for i in 1..self.num_terms - 1 {
-            {
-                self.intersection_docset
-                    .docset_mut_specialized(i)
-                    .positions(&mut self.right);
-            }
-            intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
-            if intersection_len == 0 {
-                return false;
-            }
-        }
-
-        self.intersection_docset
-            .docset_mut_specialized(self.num_terms - 1)
-            .positions(&mut self.right);
+        let intersection_len = if self.has_slop() {
+            self.compute_match_with_slop()
+        } else {
+            self.compute_match()
+        };
        intersection_exists(&self.left[..intersection_len], &self.right[..])
    }

    fn compute_phrase_count(&mut self) -> u32 {
+        let intersection_len = if self.has_slop() {
+            self.compute_match_with_slop()
+        } else {
+            self.compute_match()
+        };
+        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
+    }
+
+    /// Computes match without slop.
+    fn compute_match(&mut self) -> usize {
        {
            self.intersection_docset
                .docset_mut_specialized(0)
@@ -217,14 +286,52 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
            }
            intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
            if intersection_len == 0 {
-                return 0u32;
+                return 0;
            }
        }

        self.intersection_docset
            .docset_mut_specialized(self.num_terms - 1)
            .positions(&mut self.right);
-        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
+        intersection_len
+    }
+
+    // Computes match with slop.
+    fn compute_match_with_slop(&mut self) -> usize {
+        {
+            self.intersection_docset
+                .docset_mut_specialized(0)
+                .positions(&mut self.left);
+        }
+        let mut intersection_len = self.left.len();
+        // We'll increment the values to be equal to the next match in the right array to achieve ordered slop.
+        let mut left_begin_vec = self.left.clone();
+        let left_begin = &mut left_begin_vec[..];
+        for i in 1..self.num_terms {
+            {
+                self.intersection_docset
+                    .docset_mut_specialized(i)
+                    .positions(&mut self.right);
+            }
+            intersection_len = intersection_with_distance(
+                &mut self.left[..intersection_len],
+                &mut left_begin[..intersection_len],
+                &mut self.right[..],
+                self.slop,
+            );
+            // Update the left to be equal to the right. Merge the initial left.
+            if intersection_len == 0 {
+                return 0;
+            }
+        }
+        self.intersection_docset
+            .docset_mut_specialized(self.num_terms - 1)
+            .positions(&mut self.right);
+        intersection_len
+    }
+
+    fn has_slop(&self) -> bool {
+        self.slop > 0
    }
 }

@@ -267,18 +374,28 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {

 #[cfg(test)]
 mod tests {
-    use super::{intersection, intersection_count};
+    use super::{intersection, intersection_count, intersection_with_distance};

    fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
-        test_intersection_aux(left, right, expected);
-        test_intersection_aux(right, left, expected);
+        test_intersection_aux(left, right, expected, 0);
+        test_intersection_aux(right, left, expected, 0);
    }

-    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
+    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
        let mut left_vec = Vec::from(left);
        let left_mut = &mut left_vec[..];
-        assert_eq!(intersection_count(left_mut, right), expected.len());
-        let count = intersection(left_mut, right);
+        if slop == 0 {
+            let left_mut = &mut left_vec[..];
+            assert_eq!(intersection_count(left_mut, right), expected.len());
+            let count = intersection(left_mut, right);
+            assert_eq!(&left_mut[..count], expected);
+            return;
+        }
+        let mut right_vec = Vec::from(right);
+        let right_mut = &mut right_vec[..];
+        let mut left_begin_vec = Vec::from(left);
+        let left_begin_mut = &mut left_begin_vec[..];
+        let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
        assert_eq!(&left_mut[..count], expected);
    }

@@ -290,6 +407,51 @@ mod tests {
        test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
        test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
    }
+    #[test]
+    fn test_slop() {
+        // The slop is not symetric. It does not allow for the phrase to be out of order.
+        test_intersection_aux(&[1], &[2], &[2], 1);
+        test_intersection_aux(&[1], &[3], &[], 1);
+        test_intersection_aux(&[1], &[3], &[3], 2);
+        test_intersection_aux(&[], &[2], &[], 100000);
+        test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
+        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
+        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
+    }
+
+    fn test_merge(
+        left: &[u32],
+        right: &[u32],
+        left_begin: &[u32],
+        expected_left: &[u32],
+        expected_left_begin: &[u32],
+        slop: u32,
+    ) {
+        let mut left_vec = Vec::from(left);
+        let left_mut = &mut left_vec[..];
+        let mut right_vec = Vec::from(right);
+        let right_mut = &mut right_vec[..];
+        let mut left_begin_vec = Vec::from(left_begin);
+        let left_begin_mut = &mut left_begin_vec[..];
+        let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
+        assert_eq!(&left_mut[..count], expected_left);
+        assert_eq!(&left_begin_mut[..count], expected_left_begin);
+    }
+
+    #[test]
+    fn test_merge_slop() {
+        test_merge(&[1, 2], &[1], &[0, 1], &[1], &[0], 1);
+        test_merge(&[3], &[4], &[2], &[4], &[2], 2);
+        test_merge(&[3], &[4], &[2], &[4], &[2], 2);
+        test_merge(
+            &[1, 5, 6, 9, 10, 12],
+            &[6, 8, 9, 12],
+            &[0, 1, 2, 3, 4, 5],
+            &[6, 9, 12],
+            &[2, 3, 5],
+            10,
+        );
+    }
 }

 #[cfg(all(test, feature = "unstable"))]
--- a/src/query/phrase_query/phrase_weight.rs
+++ b/src/query/phrase_query/phrase_weight.rs
@@ -16,6 +16,7 @@ pub struct PhraseWeight {
    phrase_terms: Vec<(usize, Term)>,
    similarity_weight: Bm25Weight,
    scoring_enabled: bool,
+    slop: u32,
 }

 impl PhraseWeight {
@@ -25,10 +26,12 @@ impl PhraseWeight {
        similarity_weight: Bm25Weight,
        scoring_enabled: bool,
    ) -> PhraseWeight {
+        let slop = 0;
        PhraseWeight {
            phrase_terms,
            similarity_weight,
            scoring_enabled,
+            slop,
        }
    }

@@ -77,8 +80,13 @@ impl PhraseWeight {
            similarity_weight,
            fieldnorm_reader,
            self.scoring_enabled,
+            self.slop,
        )))
    }
+
+    pub fn slop(&mut self, slop: u32) {
+        self.slop = slop;
+    }
 }

 impl Weight for PhraseWeight {
Author	SHA1	Message	Date
Halvor Bø	d09720c989	Fixed bench	2021-07-07 00:32:01 +01:00
Halvor Bø	d9bcf6e26b	Cleaned up variable names	2021-07-07 00:25:42 +01:00
Halvor Bø	7ba0d0df1c	Made the implementation respect ordering	2021-07-06 23:58:15 +01:00
Halvor Bø	f1f922ea1b	MProof of concept implementation of slop	2021-07-06 20:50:41 +01:00
Halvor Bø	c35343d361	Added todo	2021-07-05 23:30:34 +01:00
Halvor Bø	7fe421ae2a	Initial	2021-07-05 23:04:09 +01:00