Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2026-01-05 00:32:55 +00:00

Compare commits
12 commits on `poll-meta` (audunhalla)
| Author | SHA1 | Date |
|---|---|---|
| | 548129fc6d | |
| | 55f5658d40 | |
| | 3ae6363462 | |
| | 9e20d7f8a5 | |
| | ab13ffe377 | |
| | 039138ed50 | |
| | 6227a0555a | |
| | f85d0a522a | |
| | 5795488ba7 | |
| | c3045dfb5c | |
| | 9f04f42b64 | |
| | aeb8ae3ef0 | |
@@ -1,6 +1,9 @@
 Tantivy 0.12.0
 ======================
 - Removing static dispatch in tokenizers for simplicity. (#762)
+- Added backward iteration for `TermDictionary` stream. (@halvorboe)
+- Fixed a performance issue when searching for the posting lists of a missing term. (@audunhalland)
+- Added a configurable maximum number of docs (10M by default) for a segment to be considered for merge. (@hntd187, landed by @halvorboe #713)

 ## How to update?

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.11.3"
+version = "0.12.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -18,7 +18,7 @@ byteorder = "1.0"
 crc32fast = "1.2.0"
 once_cell = "1.0"
 regex = {version = "1.3.0", default-features = false, features = ["std"]}
-tantivy-fst = "0.2"
+tantivy-fst = "0.2.1"
 memmap = {version = "0.7", optional = true}
 lz4 = {version = "1.20", optional = true}
 snap = {version = "0.2"}

@@ -60,7 +60,6 @@ winapi = "0.3"
 rand = "0.7"
 maplit = "1"
 matches = "0.1.8"
-time = "0.1.42"

 [dev-dependencies.fail]
 version = "0.3"
@@ -10,7 +10,9 @@ use rayon::{ThreadPool, ThreadPoolBuilder};
 /// API of a dependency, knowing it might conflict with a different version
 /// used by the client. Second, we may stop using rayon in the future.
 pub enum Executor {
+    /// Single thread variant of an Executor
     SingleThread,
+    /// Thread pool variant of an Executor
     ThreadPool(ThreadPool),
 }

@@ -20,7 +22,7 @@ impl Executor {
         Executor::SingleThread
     }

-    // Creates an Executor that dispatches the tasks in a thread pool.
+    /// Creates an Executor that dispatches the tasks in a thread pool.
     pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Result<Executor> {
         let pool = ThreadPoolBuilder::new()
             .num_threads(num_threads)

@@ -29,10 +31,10 @@ impl Executor {
         Ok(Executor::ThreadPool(pool))
     }

-    // Perform a map in the thread pool.
-    //
-    // Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
-    // will propagate to the caller.
+    /// Perform a map in the thread pool.
+    ///
+    /// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
+    /// will propagate to the caller.
    pub fn map<
        A: Send,
        R: Send,
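These hunks promote `Executor` to a documented public type (see the `lib.rs` re-export further down). A minimal sketch of how it might be used — assuming `map` takes the task closure first and an iterator of arguments second, returning `tantivy::Result<Vec<R>>` as the diff suggests:

```rust
use tantivy::Executor;

fn main() -> tantivy::Result<()> {
    // Thread names in the pool are prefixed with the given string.
    let executor = Executor::multi_thread(4, "tantivy-demo-")?;
    // Panics in a task propagate to the caller for both variants.
    let squares = executor.map(|n: u64| Ok(n * n), 0u64..8u64)?;
    assert_eq!(squares, vec![0, 1, 4, 9, 16, 25, 36, 49]);
    Ok(())
}
```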
@@ -60,7 +60,7 @@ impl InvertedIndexReader {
             .get_index_record_option()
             .unwrap_or(IndexRecordOption::Basic);
         InvertedIndexReader {
-            termdict: TermDictionary::empty(&field_type),
+            termdict: TermDictionary::empty(),
             postings_source: ReadOnlySource::empty(),
             positions_source: ReadOnlySource::empty(),
             positions_idx_source: ReadOnlySource::empty(),
@@ -7,9 +7,6 @@ pub use self::writer::MultiValueIntFastFieldWriter;
 #[cfg(test)]
 mod tests {

-    use time;
-
-    use self::time::Duration;
     use crate::collector::TopDocs;
     use crate::query::QueryParser;
     use crate::schema::Cardinality;

@@ -17,6 +14,7 @@ mod tests {
     use crate::schema::IntOptions;
     use crate::schema::Schema;
     use crate::Index;
+    use chrono::Duration;

     #[test]
     fn test_multivalued_u64() {
@@ -897,7 +897,7 @@ mod tests {
         let index_writer = index.writer(3_000_000).unwrap();
         assert_eq!(
             format!("{:?}", index_writer.get_merge_policy()),
-            "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
+            "LogMergePolicy { min_merge_size: 8, max_merge_size: 10000000, min_layer_size: 10000, \
             level_log_size: 0.75 }"
         );
         let merge_policy = Box::new(NoMergePolicy::default());
@@ -6,12 +6,14 @@ use std::f64;
 const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
 const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
 const DEFAULT_MIN_MERGE_SIZE: usize = 8;
+const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;

 /// `LogMergePolicy` tries to merge segments that have a similar number of
 /// documents.
 #[derive(Debug, Clone)]
 pub struct LogMergePolicy {
     min_merge_size: usize,
+    max_merge_size: usize,
     min_layer_size: u32,
     level_log_size: f64,
 }

@@ -26,6 +28,12 @@ impl LogMergePolicy {
         self.min_merge_size = min_merge_size;
     }

+    /// Set the maximum number of docs in a segment for it to be considered for
+    /// merging.
+    pub fn set_max_merge_size(&mut self, max_merge_size: usize) {
+        self.max_merge_size = max_merge_size;
+    }
+
     /// Set the minimum segment size under which all segments belong
     /// to the same level.
     pub fn set_min_layer_size(&mut self, min_layer_size: u32) {

@@ -53,6 +61,7 @@ impl MergePolicy for LogMergePolicy {
         let mut size_sorted_tuples = segments
             .iter()
             .map(SegmentMeta::num_docs)
+            .filter(|s| s <= &(self.max_merge_size as u32))
             .enumerate()
             .collect::<Vec<(usize, u32)>>();

@@ -86,6 +95,7 @@ impl Default for LogMergePolicy {
     fn default() -> LogMergePolicy {
         LogMergePolicy {
             min_merge_size: DEFAULT_MIN_MERGE_SIZE,
+            max_merge_size: DEFAULT_MAX_MERGE_SIZE,
             min_layer_size: DEFAULT_MIN_LAYER_SIZE,
             level_log_size: DEFAULT_LEVEL_LOG_SIZE,
         }

@@ -104,6 +114,7 @@ mod tests {
     fn test_merge_policy() -> LogMergePolicy {
         let mut log_merge_policy = LogMergePolicy::default();
         log_merge_policy.set_min_merge_size(3);
+        log_merge_policy.set_max_merge_size(100_000);
         log_merge_policy.set_min_layer_size(2);
         log_merge_policy
     }

@@ -141,11 +152,11 @@ mod tests {
             create_random_segment_meta(10),
             create_random_segment_meta(10),
             create_random_segment_meta(10),
-            create_random_segment_meta(1000),
-            create_random_segment_meta(1000),
-            create_random_segment_meta(1000),
-            create_random_segment_meta(10000),
-            create_random_segment_meta(10000),
+            create_random_segment_meta(1_000),
+            create_random_segment_meta(1_000),
+            create_random_segment_meta(1_000),
+            create_random_segment_meta(10_000),
+            create_random_segment_meta(10_000),
             create_random_segment_meta(10),
             create_random_segment_meta(10),
             create_random_segment_meta(10),

@@ -182,4 +193,19 @@ mod tests {
         let result_list = test_merge_policy().compute_merge_candidates(&test_input);
         assert_eq!(result_list.len(), 1);
     }
+
+    #[test]
+    fn test_large_merge_segments() {
+        let test_input = vec![
+            create_random_segment_meta(1_000_000),
+            create_random_segment_meta(100_001),
+            create_random_segment_meta(100_000),
+            create_random_segment_meta(100_000),
+            create_random_segment_meta(100_000),
+        ];
+        let result_list = test_merge_policy().compute_merge_candidates(&test_input);
+        // Do not include large segments
+        assert_eq!(result_list.len(), 1);
+        assert_eq!(result_list[0].0.len(), 3)
+    }
 }
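A sketch of wiring the new cap into an index, assuming the usual `Index::writer` / `IndexWriter::set_merge_policy` API:

```rust
use tantivy::merge_policy::LogMergePolicy;
use tantivy::schema::Schema;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let index = Index::create_in_ram(Schema::builder().build());
    let index_writer = index.writer(50_000_000)?;

    // Segments above one million docs are no longer considered for merging.
    let mut merge_policy = LogMergePolicy::default();
    merge_policy.set_max_merge_size(1_000_000);
    index_writer.set_merge_policy(Box::new(merge_policy));
    Ok(())
}
```

Oversized segments simply drop out of the candidate list before level bucketing, which is exactly what `test_large_merge_segments` asserts: with a cap of 100,000, the 1,000,000- and 100,001-doc segments are filtered out, leaving one merge candidate of three segments.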
@@ -161,7 +161,7 @@ pub use self::snippet::{Snippet, SnippetGenerator};
 mod docset;
 pub use self::docset::{DocSet, SkipResult};
 pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
-pub use crate::core::SegmentComponent;
+pub use crate::core::{Executor, SegmentComponent};
 pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
 pub use crate::core::{InvertedIndexReader, SegmentReader};
 pub use crate::directory::Directory;
@@ -148,8 +148,7 @@ impl<'a> FieldSerializer<'a> {
             }
             _ => (false, false),
         };
-        let term_dictionary_builder =
-            TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
+        let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
         let postings_serializer =
             PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
         let positions_serializer_opt = if position_enabled {
@@ -15,6 +15,7 @@ use tantivy_fst::Automaton;
 pub struct AutomatonWeight<A> {
     field: Field,
     automaton: Arc<A>,
+    boost: f32,
 }

 impl<A> AutomatonWeight<A>

@@ -26,9 +27,15 @@ where
         AutomatonWeight {
             field,
             automaton: automaton.into(),
+            boost: 1.0,
         }
     }

+    /// Boost the scorer by the given factor.
+    pub fn boost_by(self, boost: f32) -> Self {
+        Self { boost, ..self }
+    }
+
     fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
         let automaton: &A = &*self.automaton;
         let term_stream_builder = term_dict.search(automaton);

@@ -58,7 +65,7 @@ where
             }
         }
         let doc_bitset = BitSetDocSet::from(doc_bitset);
-        Ok(Box::new(ConstScorer::new(doc_bitset)))
+        Ok(Box::new(ConstScorer::with_score(doc_bitset, self.boost)))
     }

     fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
@@ -34,7 +34,7 @@ pub struct BM25Weight {
 }

 impl BM25Weight {
-    pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
+    pub fn for_terms(searcher: &Searcher, terms: &[Term], boost: f32) -> BM25Weight {
         assert!(!terms.is_empty(), "BM25 requires at least one term");
         let field = terms[0].field();
         for term in &terms[1..] {

@@ -75,11 +75,11 @@ impl BM25Weight {
                 .sum::<f32>();
             idf_explain = Explanation::new("idf", idf);
         }
-        BM25Weight::new(idf_explain, average_fieldnorm)
+        BM25Weight::new(idf_explain, average_fieldnorm, boost)
     }

-    fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
-        let weight = idf_explain.value() * (1f32 + K1);
+    fn new(idf_explain: Explanation, average_fieldnorm: f32, boost: f32) -> BM25Weight {
+        let weight = idf_explain.value() * (1f32 + K1) * boost;
         BM25Weight {
             idf_explain,
             weight,
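The boost is thus folded once into the precomputed per-query weight rather than applied per document. In the diff's own terms:

$$\text{weight} = \text{idf} \cdot (1 + K_1) \cdot \text{boost}$$

which is why the `TermQuery` test further down expects a score of exactly `0.28768212 * 42.0` after `boost_by(42.0)`.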
@@ -79,6 +79,7 @@ pub struct FuzzyTermQuery {
     transposition_cost_one: bool,
     ///
     prefix: bool,
+    boost: f32,
 }

 impl FuzzyTermQuery {

@@ -89,6 +90,7 @@ impl FuzzyTermQuery {
             distance,
             transposition_cost_one,
             prefix: false,
+            boost: 1.0,
         }
     }

@@ -99,16 +101,22 @@ impl FuzzyTermQuery {
             distance,
             transposition_cost_one,
             prefix: true,
+            boost: 1.0,
         }
     }

+    /// Boost the query score by the given factor.
+    pub fn boost_by(self, boost: f32) -> Self {
+        Self { boost, ..self }
+    }
+
     fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
         // LEV_BUILDER is a HashMap, whose `get` method returns an Option
         match LEV_BUILDER.get(&(self.distance, false)) {
             // Unwrap the option and build the Ok(AutomatonWeight)
             Some(automaton_builder) => {
                 let automaton = automaton_builder.build_dfa(self.term.text());
-                Ok(AutomatonWeight::new(self.term.field(), automaton))
+                Ok(AutomatonWeight::new(self.term.field(), automaton).boost_by(self.boost))
             }
             None => Err(InvalidArgument(format!(
                 "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
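From the caller's side the boost is a chainable builder step. A sketch (the `title` field is hypothetical):

```rust
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn build_fuzzy() -> FuzzyTermQuery {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    // Match terms within Levenshtein distance 1 of "tantivy",
    // counting a transposition as a single edit, at double weight.
    FuzzyTermQuery::new(Term::from_field_text(title, "tantivy"), 1, true).boost_by(2.0)
}
```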
@@ -27,6 +27,7 @@ use std::collections::BTreeSet;
 pub struct PhraseQuery {
     field: Field,
     phrase_terms: Vec<(usize, Term)>,
+    boost: f32,
 }

 impl PhraseQuery {

@@ -57,9 +58,15 @@ impl PhraseQuery {
         PhraseQuery {
             field,
             phrase_terms: terms,
+            boost: 1.0,
         }
     }

+    /// Boost the query score by the given factor.
+    pub fn boost_by(self, boost: f32) -> Self {
+        Self { boost, ..self }
+    }
+
     /// The `Field` this `PhraseQuery` is targeting.
     pub fn field(&self) -> Field {
         self.field

@@ -97,7 +104,7 @@ impl PhraseQuery {
             )));
         }
         let terms = self.phrase_terms();
-        let bm25_weight = BM25Weight::for_terms(searcher, &terms);
+        let bm25_weight = BM25Weight::for_terms(searcher, &terms, self.boost);
         Ok(PhraseWeight::new(
             self.phrase_terms.clone(),
             bm25_weight,
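Because the boost rides on the shared `BM25Weight`, a boosted phrase composes naturally with other clauses. A sketch, assuming the usual `BooleanQuery`/`Occur` API:

```rust
use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

// Prefer documents matching the exact phrase over bare term hits.
fn prefer_phrase(body: Field) -> BooleanQuery {
    let phrase = PhraseQuery::new(vec![
        Term::from_field_text(body, "hello"),
        Term::from_field_text(body, "world"),
    ])
    .boost_by(3.0);
    let term = TermQuery::new(
        Term::from_field_text(body, "hello"),
        IndexRecordOption::WithFreqs,
    );
    BooleanQuery::from(vec![
        (Occur::Should, Box::new(phrase) as Box<dyn Query>),
        (Occur::Should, Box::new(term) as Box<dyn Query>),
    ])
}
```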
@@ -54,6 +54,7 @@ use tantivy_fst::Regex;
 pub struct RegexQuery {
     regex: Arc<Regex>,
     field: Field,
+    boost: f32,
 }

 impl RegexQuery {

@@ -69,11 +70,17 @@ impl RegexQuery {
         RegexQuery {
             regex: regex.into(),
             field,
+            boost: 1.0,
         }
     }

+    /// Boost the query score by the given factor.
+    pub fn boost_by(self, boost: f32) -> Self {
+        Self { boost, ..self }
+    }
+
     fn specialized_weight(&self) -> AutomatonWeight<Regex> {
-        AutomatonWeight::new(self.field, self.regex.clone())
+        AutomatonWeight::new(self.field, self.regex.clone()).boost_by(self.boost)
     }
 }
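And the same pattern for regex queries — a sketch, assuming a `from_pattern`-style constructor (present in later tantivy releases; the exact 0.12 constructor may differ):

```rust
use tantivy::query::RegexQuery;
use tantivy::schema::Field;

// Down-weight broad regex matches relative to exact clauses.
fn build_regex(body: Field) -> tantivy::Result<RegexQuery> {
    Ok(RegexQuery::from_pattern("tan.*vy", body)?.boost_by(0.5))
}
```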
@@ -56,6 +56,11 @@ impl<TDocSet: DocSet> ConstScorer<TDocSet> {
         }
     }

+    /// Creates a new `ConstScorer` with a custom score value.
+    pub fn with_score(docset: TDocSet, score: f32) -> ConstScorer<TDocSet> {
+        ConstScorer { docset, score }
+    }
+
     /// Sets the constant score to a different value.
     pub fn set_score(&mut self, score: Score) {
         self.score = score;
@@ -45,6 +45,35 @@ mod tests {
         assert_eq!(term_scorer.score(), 0.28768212);
     }

+    #[test]
+    pub fn test_term_query_boost_by() {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", STRING);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            {
+                let doc = doc!(text_field => "a");
+                index_writer.add_document(doc);
+            }
+            assert!(index_writer.commit().is_ok());
+        }
+        let searcher = index.reader().unwrap().searcher();
+        let term_query = TermQuery::new(
+            Term::from_field_text(text_field, "a"),
+            IndexRecordOption::Basic,
+        )
+        .boost_by(42.0);
+        let term_weight = term_query.weight(&searcher, true).unwrap();
+        let segment_reader = searcher.segment_reader(0);
+        let mut term_scorer = term_weight.scorer(segment_reader).unwrap();
+        assert!(term_scorer.advance());
+        assert_eq!(term_scorer.doc(), 0);
+        assert_nearly_equals(0.28768212 * 42.0, term_scorer.score());
+    }
+
     #[test]
     pub fn test_term_weight() {
         let mut schema_builder = Schema::builder();
@@ -61,6 +61,7 @@ use std::fmt;
 pub struct TermQuery {
     term: Term,
     index_record_option: IndexRecordOption,
+    boost: f32,
 }

 impl fmt::Debug for TermQuery {

@@ -75,9 +76,15 @@ impl TermQuery {
         TermQuery {
             term,
             index_record_option: segment_postings_options,
+            boost: 1.0,
         }
     }

+    /// Boost the query score by the given factor.
+    pub fn boost_by(self, boost: f32) -> Self {
+        Self { boost, ..self }
+    }
+
     /// The `Term` this query is built out of.
     pub fn term(&self) -> &Term {
         &self.term

@@ -90,7 +97,7 @@ impl TermQuery {
     /// This is useful for optimization purposes.
     pub fn specialized_weight(&self, searcher: &Searcher, scoring_enabled: bool) -> TermWeight {
         let term = self.term.clone();
-        let bm25_weight = BM25Weight::for_terms(searcher, &[term]);
+        let bm25_weight = BM25Weight::for_terms(searcher, &[term], self.boost);
         let index_record_option = if scoring_enabled {
             self.index_record_option
         } else {
@@ -122,6 +122,11 @@ impl Facet {
     pub fn to_path(&self) -> Vec<&str> {
         self.encoded_str().split(|c| c == FACET_SEP_CHAR).collect()
     }
+
+    /// This function is the inverse of `Facet::from(&str)`.
+    pub fn to_path_string(&self) -> String {
+        self.to_string()
+    }
 }

 impl Borrow<str> for Facet {

@@ -265,4 +270,21 @@ mod tests {
         let facet = Facet::from_path(v.iter());
         assert_eq!(facet.to_path(), v);
     }
+
+    #[test]
+    fn test_to_path_string() {
+        let v = ["first", "second", "third/not_fourth"];
+        let facet = Facet::from_path(v.iter());
+        assert_eq!(
+            facet.to_path_string(),
+            String::from("/first/second/third\\/not_fourth")
+        );
+    }
+
+    #[test]
+    fn test_to_path_string_empty() {
+        let v: Vec<&str> = vec![];
+        let facet = Facet::from_path(v.iter());
+        assert_eq!(facet.to_path_string(), "/");
+    }
 }
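The escaping in `test_to_path_string` is the point of the new method: a literal `/` inside a single path segment is escaped on the way out, so the rendered string stays parseable. A small sketch:

```rust
use tantivy::schema::Facet;

fn demo() {
    let facet = Facet::from_path(["lang", "C/C++"].iter());
    // One logical segment containing '/', escaped in the rendered path.
    assert_eq!(facet.to_path_string(), "/lang/C\\/C++");
    assert_eq!(facet.to_path(), vec!["lang", "C/C++"]);
}
```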
@@ -38,7 +38,7 @@ mod tests {
     use crate::core::Index;
     use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
     use crate::postings::TermInfo;
-    use crate::schema::{Document, FieldType, Schema, TEXT};
+    use crate::schema::{Document, Schema, TEXT};
     use std::path::PathBuf;
     use std::str;
@@ -52,6 +52,12 @@ mod tests {
         }
     }

+    #[test]
+    fn test_empty_term_dictionary() {
+        let empty = TermDictionary::empty();
+        assert!(empty.stream().next().is_none());
+    }
+
     #[test]
     fn test_term_ordinals() {
         const COUNTRIES: [&'static str; 7] = [
@@ -67,9 +73,7 @@ mod tests {
         let path = PathBuf::from("TermDictionary");
         {
             let write = directory.open_write(&path).unwrap();
-            let field_type = FieldType::Str(TEXT);
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(write, &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
             for term in COUNTRIES.iter() {
                 term_dictionary_builder
                     .insert(term.as_bytes(), &make_term_info(0u64))

@@ -93,9 +97,7 @@ mod tests {
         let path = PathBuf::from("TermDictionary");
         {
             let write = directory.open_write(&path).unwrap();
-            let field_type = FieldType::Str(TEXT);
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(write, &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
             term_dictionary_builder
                 .insert("abc".as_bytes(), &make_term_info(34u64))
                 .unwrap();
@@ -179,10 +181,8 @@ mod tests {
         let ids: Vec<_> = (0u32..10_000u32)
             .map(|i| (format!("doc{:0>6}", i), i))
             .collect();
-        let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             for &(ref id, ref i) in &ids {
                 term_dictionary_builder
                     .insert(id.as_bytes(), &make_term_info(*i as u64))

@@ -209,10 +209,8 @@ mod tests {

     #[test]
     fn test_stream_high_range_prefix_suffix() {
-        let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             // term requires more than 16bits
             term_dictionary_builder
                 .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))

@@ -244,10 +242,8 @@ mod tests {
         let ids: Vec<_> = (0u32..10_000u32)
             .map(|i| (format!("doc{:0>6}", i), i))
             .collect();
-        let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             for &(ref id, ref i) in &ids {
                 term_dictionary_builder
                     .insert(id.as_bytes(), &make_term_info(*i as u64))

@@ -313,10 +309,8 @@ mod tests {

     #[test]
     fn test_empty_string() {
-        let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             term_dictionary_builder
                 .insert(&[], &make_term_info(1 as u64))
                 .unwrap();

@@ -337,10 +331,8 @@ mod tests {

     #[test]
     fn test_stream_range_boundaries() {
-        let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             for i in 0u8..10u8 {
                 let number_arr = [i; 1];
                 term_dictionary_builder
@@ -352,41 +344,91 @@ mod tests {
         let source = ReadOnlySource::from(buffer);
         let term_dictionary: TermDictionary = TermDictionary::from_source(&source);

-        let value_list = |mut streamer: TermStreamer<'_>| {
+        let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
             let mut res: Vec<u32> = vec![];
             while let Some((_, ref v)) = streamer.next() {
                 res.push(v.doc_freq);
             }
+            if backwards {
+                res.reverse();
+            }
             res
         };
+        {
+            let range = term_dictionary.range().backward().into_stream();
+            assert_eq!(
+                value_list(range, true),
+                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
+            );
+        }
         {
             let range = term_dictionary.range().ge([2u8]).into_stream();
             assert_eq!(
-                value_list(range),
+                value_list(range, false),
                 vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
             );
         }
+        {
+            let range = term_dictionary.range().ge([2u8]).backward().into_stream();
+            assert_eq!(
+                value_list(range, true),
+                vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
+            );
+        }
         {
             let range = term_dictionary.range().gt([2u8]).into_stream();
             assert_eq!(
-                value_list(range),
+                value_list(range, false),
                 vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
             );
         }
+        {
+            let range = term_dictionary.range().gt([2u8]).backward().into_stream();
+            assert_eq!(
+                value_list(range, true),
+                vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
+            );
+        }
         {
             let range = term_dictionary.range().lt([6u8]).into_stream();
-            assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]);
+            assert_eq!(
+                value_list(range, false),
+                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
+            );
+        }
+        {
+            let range = term_dictionary.range().lt([6u8]).backward().into_stream();
+            assert_eq!(
+                value_list(range, true),
+                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
+            );
         }
         {
             let range = term_dictionary.range().le([6u8]).into_stream();
             assert_eq!(
-                value_list(range),
+                value_list(range, false),
                 vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
            );
         }
+        {
+            let range = term_dictionary.range().le([6u8]).backward().into_stream();
+            assert_eq!(
+                value_list(range, true),
+                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
+            );
+        }
         {
             let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
-            assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
+            assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
+        }
+        {
+            let range = term_dictionary
+                .range()
+                .ge([0u8])
+                .lt([5u8])
+                .backward()
+                .into_stream();
+            assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
         }
     }
@@ -408,9 +450,7 @@ mod tests {
         let path = PathBuf::from("TermDictionary");
         {
             let write = directory.open_write(&path).unwrap();
-            let field_type = FieldType::Str(TEXT);
-            let mut term_dictionary_builder =
-                TermDictionaryBuilder::create(write, &field_type).unwrap();
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
             for term in COUNTRIES.iter() {
                 term_dictionary_builder
                     .insert(term.as_bytes(), &make_term_info(0u64))
@@ -51,6 +51,12 @@ where
         self
     }

+    /// Iterate over the range backwards.
+    pub fn backward(mut self) -> Self {
+        self.stream_builder = self.stream_builder.backward();
+        self
+    }
+
     /// Creates the stream corresponding to the range
     /// of terms defined using the `TermStreamerBuilder`.
     pub fn into_stream(self) -> TermStreamer<'a, A> {
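End to end, the new `backward()` hook (delegating to `tantivy-fst`, hence the 0.2.1 bump in `Cargo.toml`) makes reverse range scans possible from a searcher. A sketch, with the field and lower bound purely illustrative:

```rust
use tantivy::schema::Field;
use tantivy::Searcher;

// Collect the terms of one segment in reverse lexicographic order.
fn terms_backward(searcher: &Searcher, body: Field) -> Vec<String> {
    let inverted_index = searcher.segment_reader(0).inverted_index(body);
    let mut stream = inverted_index
        .terms()
        .range()
        .ge("a".as_bytes())
        .backward()
        .into_stream();
    let mut terms = Vec::new();
    while let Some((key, _term_info)) = stream.next() {
        terms.push(String::from_utf8_lossy(key).into_owned());
    }
    terms
}
```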
@@ -4,8 +4,8 @@ use crate::common::BinarySerializable;
 use crate::common::CountingWriter;
 use crate::directory::ReadOnlySource;
 use crate::postings::TermInfo;
-use crate::schema::FieldType;
 use crate::termdict::TermOrdinal;
+use once_cell::sync::Lazy;
 use std::io::{self, Write};
 use tantivy_fst;
 use tantivy_fst::raw::Fst;

@@ -29,7 +29,7 @@ where
     W: Write,
 {
     /// Creates a new `TermDictionaryBuilder`
-    pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
+    pub fn create(w: W) -> io::Result<Self> {
         let fst_builder = tantivy_fst::MapBuilder::new(w).map_err(convert_fst_error)?;
         Ok(TermDictionaryBuilder {
             fst_builder,

@@ -92,6 +92,14 @@ fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
     tantivy_fst::Map::from(fst)
 }

+static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
+    let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
+        .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
+        .finish()
+        .expect("Writing in a Vec<u8> should never fail");
+    ReadOnlySource::from(term_dictionary_data)
+});
+
 /// The term dictionary contains all of the terms in
 /// `tantivy index` in a sorted manner.
 ///

@@ -122,14 +130,8 @@ impl TermDictionary {
     }

     /// Creates an empty term dictionary which contains no terms.
-    pub fn empty(field_type: &FieldType) -> Self {
-        let term_dictionary_data: Vec<u8> =
-            TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
-                .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
-                .finish()
-                .expect("Writing in a Vec<u8> should never fail");
-        let source = ReadOnlySource::from(term_dictionary_data);
-        Self::from_source(&source)
+    pub fn empty() -> Self {
+        TermDictionary::from_source(&*EMPTY_DATA_SOURCE)
     }

     /// Returns the number of terms in the dictionary.