Reuse the `idf` function from the bm25 module, since the logic was identical

This commit is contained in:
Evance Souamoro
2021-05-03 10:05:40 +00:00
parent 712c01aa93
commit d71aa57077
3 changed files with 6 additions and 9 deletions

View File

@@ -8,8 +8,9 @@ Tantivy 0.15.0
- Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
- Date field support for range queries (@rihardsk) #516
- Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDireotory -> RamDirectory. (@pmasurel)
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
- Simplified positions index format (@fulmicoton) #1022
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
Tantivy 0.14.0
=========================
@@ -25,6 +26,7 @@ Tantivy 0.14.0
- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@fulmicoton)
- `FilterCollector` now supports all Fast Field value types (@barrotsteindev)
- FastField are not all loaded when opening the segment reader. (@fulmicoton)
- Added an API to merge segments, see `tantivy::merge_segments` #1005. (@evanxg852000)
This version breaks compatibility and requires users to reindex everything.

View File

@@ -9,7 +9,7 @@ use serde::Serialize;
const K1: Score = 1.2;
const B: Score = 0.75;
fn idf(doc_freq: u64, doc_count: u64) -> Score {
pub(crate) fn idf(doc_freq: u64, doc_count: u64) -> Score {
assert!(doc_count >= doc_freq, "{} >= {}", doc_count, doc_freq);
let x = ((doc_count - doc_freq) as Score + 0.5) / (doc_freq as Score + 0.5);
(1.0 + x).ln()

View File

@@ -2,7 +2,7 @@ use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
use crate::{
query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery},
query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery, bm25::idf},
schema::{Field, FieldType, FieldValue, IndexRecordOption, Term, Value},
tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer},
DocAddress, Result, Searcher, TantivyError,
@@ -358,7 +358,7 @@ impl MoreLikeThis {
}
// compute similarity & score
let idf = self.idf(doc_freq, num_docs);
let idf = idf(doc_freq, num_docs);
let score = (*term_frequency as f32) * idf;
if let Some(limit) = self.max_query_terms {
if score_terms.len() > limit {
@@ -383,9 +383,4 @@ impl MoreLikeThis {
Ok(score_terms_vec)
}
/// Computes the similarity
fn idf(&self, doc_freq: u64, doc_count: u64) -> f32 {
let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5);
(1f32 + x).ln()
}
}