low hanging fruit in optimization

2025-12-28 04:52:55 +00:00 · 2024-06-11 15:53:22 +09:00
4 changed files with 77 additions and 20 deletions
--- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
+++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
@@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED};
 use crate::fieldnorm::FieldNormReader;
 use crate::postings::Postings;
 use crate::query::bm25::Bm25Weight;
-use crate::query::phrase_query::{intersection_count, PhraseScorer};
+use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer};
 use crate::query::Scorer;
 use crate::{DocId, Score};

@@ -92,14 +92,17 @@ impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
    }
 }

-pub struct PhrasePrefixScorer<TPostings: Postings> {
+pub struct PhrasePrefixScorer<TPostings: Postings, const SCORING_ENABLED: bool> {
    phrase_scorer: PhraseKind<TPostings>,
    suffixes: Vec<TPostings>,
    suffix_offset: u32,
    phrase_count: u32,
+    suffix_position_buffer: Vec<u32>,
 }

-impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool>
+    PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
    // If similarity_weight is None, then scoring is disabled.
    pub fn new(
        mut term_postings: Vec<(usize, TPostings)>,
@@ -107,7 +110,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
        fieldnorm_reader: FieldNormReader,
        suffixes: Vec<TPostings>,
        suffix_pos: usize,
-    ) -> PhrasePrefixScorer<TPostings> {
+    ) -> PhrasePrefixScorer<TPostings, SCORING_ENABLED> {
        // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
        let max_offset = term_postings
            .iter()
@@ -140,6 +143,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
            suffixes,
            suffix_offset: (max_offset - suffix_pos) as u32,
            phrase_count: 0,
+            suffix_position_buffer: Vec::with_capacity(100),
        };
        if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
            phrase_prefix_scorer.advance();
@@ -153,7 +157,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {

    fn matches_prefix(&mut self) -> bool {
        let mut count = 0;
-        let mut positions = Vec::new();
        let current_doc = self.doc();
        let pos_matching = self.phrase_scorer.get_intersection();
        for suffix in &mut self.suffixes {
@@ -162,16 +165,27 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
            }
            let doc = suffix.seek(current_doc);
            if doc == current_doc {
-                suffix.positions_with_offset(self.suffix_offset, &mut positions);
-                count += intersection_count(pos_matching, &positions);
+                suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
+                if SCORING_ENABLED {
+                    count += intersection_count(pos_matching, &self.suffix_position_buffer);
+                } else {
+                    if intersection_exists(pos_matching, &self.suffix_position_buffer) {
+                        return true;
+                    }
+                }
            }
        }
+        if !SCORING_ENABLED {
+            return false;
+        }
        self.phrase_count = count as u32;
        count != 0
    }
 }

-impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
+    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
    fn advance(&mut self) -> DocId {
        loop {
            let doc = self.phrase_scorer.advance();
@@ -198,9 +212,15 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
    }
 }

-impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
+impl<TPostings: Postings, const SCORING_ENABLED: bool> Scorer
+    for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
+{
    fn score(&mut self) -> Score {
+        if SCORING_ENABLED {
+            self.phrase_scorer.score()
+        } else {
+            1.0f32
+        }
        // TODO modify score??
-        self.phrase_scorer.score()
    }
 }
--- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs
+++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs
@@ -42,11 +42,11 @@ impl PhrasePrefixWeight {
        Ok(FieldNormReader::constant(reader.max_doc(), 1))
    }

-    pub(crate) fn phrase_scorer(
+    pub(crate) fn phrase_prefix_scorer<const SCORING_ENABLED: bool>(
        &self,
        reader: &SegmentReader,
        boost: Score,
-    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
+    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings, SCORING_ENABLED>>> {
        let similarity_weight_opt = self
            .similarity_weight_opt
            .as_ref()
@@ -128,15 +128,20 @@ impl PhrasePrefixWeight {

 impl Weight for PhrasePrefixWeight {
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
-            Ok(Box::new(scorer))
+        if self.similarity_weight_opt.is_some() {
+            if let Some(scorer) = self.phrase_prefix_scorer::<true>(reader, boost)? {
+                return Ok(Box::new(scorer));
+            }
        } else {
-            Ok(Box::new(EmptyScorer))
+            if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? {
+                return Ok(Box::new(scorer));
+            }
        }
+        Ok(Box::new(EmptyScorer))
    }

    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
-        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
+        let scorer_opt = self.phrase_prefix_scorer::<true>(reader, 1.0)?;
        if scorer_opt.is_none() {
            return Err(does_not_match(doc));
        }
@@ -200,7 +205,7 @@ mod tests {
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
@@ -211,6 +216,38 @@ mod tests {
        Ok(())
    }

+    #[test]
+    pub fn test_phrase_no_count() -> crate::Result<()> {
+        let index = create_index(&[
+            "aa bb dd cc",
+            "aa aa bb c dd aa bb cc aa bb dc",
+            " aa bb cd",
+        ])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader()?.searcher();
+        let phrase_query = PhrasePrefixQuery::new(vec![
+            Term::from_field_text(text_field, "aa"),
+            Term::from_field_text(text_field, "bb"),
+            Term::from_field_text(text_field, "c"),
+        ]);
+        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
+        let phrase_weight = phrase_query
+            .phrase_prefix_query_weight(enable_scoring)
+            .unwrap()
+            .unwrap();
+        let mut phrase_scorer = phrase_weight
+            .phrase_prefix_scorer::<false>(searcher.segment_reader(0u32), 1.0)?
+            .unwrap();
+        assert_eq!(phrase_scorer.doc(), 1);
+        assert_eq!(phrase_scorer.phrase_count(), 0);
+        assert_eq!(phrase_scorer.advance(), 2);
+        assert_eq!(phrase_scorer.doc(), 2);
+        assert_eq!(phrase_scorer.phrase_count(), 0);
+        assert_eq!(phrase_scorer.advance(), TERMINATED);
+        Ok(())
+    }
+
    #[test]
    pub fn test_phrase_count_mid() -> crate::Result<()> {
        let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
@@ -227,7 +264,7 @@ mod tests {
            .unwrap()
            .unwrap();
        let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
            .unwrap();
        assert_eq!(phrase_scorer.doc(), 1);
        assert_eq!(phrase_scorer.phrase_count(), 2);
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -3,8 +3,8 @@ mod phrase_scorer;
 mod phrase_weight;

 pub use self::phrase_query::PhraseQuery;
-pub(crate) use self::phrase_scorer::intersection_count;
 pub use self::phrase_scorer::PhraseScorer;
+pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists};
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -58,7 +58,7 @@ pub struct PhraseScorer<TPostings: Postings> {
 }

 /// Returns true if and only if the two sorted arrays contain a common element
-fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
+pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
    let mut left_index = 0;
    let mut right_index = 0;
    while left_index < left.len() && right_index < right.len() {