Reuse the `idf` function from the bm25 module in more-like-this, as both implemented the same logic

2025-12-26 20:19:57 +00:00 · 2021-05-03 10:05:40 +00:00
parent 712c01aa93
commit d71aa57077
3 changed files with 6 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,8 +8,9 @@ Tantivy 0.15.0
 - Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
 - Date field support for range queries (@rihardsk) #516
 - Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDireotory -> RamDirectory. (@pmasurel)
+- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
 - Simplified positions index format (@fulmicoton) #1022
+- Added support for more-like-this query in tantivy (@evanxg852000) #1011

 Tantivy 0.14.0
 =========================
@@ -25,6 +26,7 @@ Tantivy 0.14.0
 - Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@fulmicoton)
 - `FilterCollector` now supports all Fast Field value types (@barrotsteindev)
 - FastField are not all loaded when opening the segment reader. (@fulmicoton)
+- Added an API to merge segments, see `tantivy::merge_segments` #1005. (@evanxg852000)

 This version breaks compatibility and requires users to reindex everything.

--- a/src/query/bm25.rs
+++ b/src/query/bm25.rs
@@ -9,7 +9,7 @@ use serde::Serialize;
 const K1: Score = 1.2;
 const B: Score = 0.75;

-fn idf(doc_freq: u64, doc_count: u64) -> Score {
+pub(crate) fn idf(doc_freq: u64, doc_count: u64) -> Score {
    assert!(doc_count >= doc_freq, "{} >= {}", doc_count, doc_freq);
    let x = ((doc_count - doc_freq) as Score + 0.5) / (doc_freq as Score + 0.5);
    (1.0 + x).ln()
--- a/src/query/mlt/mlt.rs
+++ b/src/query/mlt/mlt.rs
@@ -2,7 +2,7 @@ use std::cmp::Reverse;
 use std::collections::{BinaryHeap, HashMap};

 use crate::{
-    query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery},
+    query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery, bm25::idf},
    schema::{Field, FieldType, FieldValue, IndexRecordOption, Term, Value},
    tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer},
    DocAddress, Result, Searcher, TantivyError,
@@ -358,7 +358,7 @@ impl MoreLikeThis {
            }

            // compute similarity & score
-            let idf = self.idf(doc_freq, num_docs);
+            let idf = idf(doc_freq, num_docs);
            let score = (*term_frequency as f32) * idf;
            if let Some(limit) = self.max_query_terms {
                if score_terms.len() > limit {
@@ -383,9 +383,4 @@ impl MoreLikeThis {
        Ok(score_terms_vec)
    }

-    /// Computes the similarity
-    fn idf(&self, doc_freq: u64, doc_count: u64) -> f32 {
-        let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5);
-        (1f32 + x).ln()
-    }
 }