diff --git a/CHANGELOG.md b/CHANGELOG.md index 164da9cb8..7f7b1ee7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,9 @@ Tantivy 0.15.0 - Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357 - Date field support for range queries (@rihardsk) #516 - Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009 -- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDireotory -> RamDirectory. (@pmasurel) +- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel) - Simplified positions index format (@fulmicoton) #1022 +- Added support for more-like-this query in tantivy (@evanxg852000) #1011 Tantivy 0.14.0 ========================= @@ -25,6 +26,7 @@ Tantivy 0.14.0 - Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@fulmicoton) - `FilterCollector` now supports all Fast Field value types (@barrotsteindev) - FastField are not all loaded when opening the segment reader. (@fulmicoton) +- Added an API to merge segments, see `tantivy::merge_segments`. (@evanxg852000) #1005 This version breaks compatibility and requires users to reindex everything. 
diff --git a/src/query/bm25.rs b/src/query/bm25.rs index f017e3d4b..c056ba971 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -9,7 +9,7 @@ use serde::Serialize; const K1: Score = 1.2; const B: Score = 0.75; -fn idf(doc_freq: u64, doc_count: u64) -> Score { +pub(crate) fn idf(doc_freq: u64, doc_count: u64) -> Score { assert!(doc_count >= doc_freq, "{} >= {}", doc_count, doc_freq); let x = ((doc_count - doc_freq) as Score + 0.5) / (doc_freq as Score + 0.5); (1.0 + x).ln() diff --git a/src/query/mlt/mlt.rs b/src/query/mlt/mlt.rs index 19f50d11d..176fcc3fd 100644 --- a/src/query/mlt/mlt.rs +++ b/src/query/mlt/mlt.rs @@ -2,7 +2,7 @@ use std::cmp::Reverse; use std::collections::{BinaryHeap, HashMap}; use crate::{ - query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery}, + query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery, bm25::idf}, schema::{Field, FieldType, FieldValue, IndexRecordOption, Term, Value}, tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer}, DocAddress, Result, Searcher, TantivyError, @@ -358,7 +358,7 @@ impl MoreLikeThis { } // compute similarity & score - let idf = self.idf(doc_freq, num_docs); + let idf = idf(doc_freq, num_docs); let score = (*term_frequency as f32) * idf; if let Some(limit) = self.max_query_terms { if score_terms.len() > limit { @@ -383,9 +383,4 @@ impl MoreLikeThis { Ok(score_terms_vec) } - /// Computes the similarity - fn idf(&self, doc_freq: u64, doc_count: u64) -> f32 { - let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5); - (1f32 + x).ln() - } }