From f85d0a522af63f7dd0b1511a344bc5db6df88638 Mon Sep 17 00:00:00 2001 From: Audun Halland Date: Thu, 30 Jan 2020 02:04:58 +0100 Subject: [PATCH] Optimize TermDictionary::empty by precomputed data source (#767) --- src/core/inverted_index_reader.rs | 2 +- src/postings/serializer.rs | 3 +-- src/termdict/mod.rs | 34 ++++++++----------------------- src/termdict/termdict.rs | 22 +++++++++++--------- 4 files changed, 23 insertions(+), 38 deletions(-) diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index a13020839..31d234644 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -60,7 +60,7 @@ impl InvertedIndexReader { .get_index_record_option() .unwrap_or(IndexRecordOption::Basic); InvertedIndexReader { - termdict: TermDictionary::empty(&field_type), + termdict: TermDictionary::empty(), postings_source: ReadOnlySource::empty(), positions_source: ReadOnlySource::empty(), positions_idx_source: ReadOnlySource::empty(), diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 79e362a62..79e436192 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -148,8 +148,7 @@ impl<'a> FieldSerializer<'a> { } _ => (false, false), }; - let term_dictionary_builder = - TermDictionaryBuilder::create(term_dictionary_write, &field_type)?; + let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?; let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); let positions_serializer_opt = if position_enabled { diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 16eb85b95..dcf03ee77 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -38,7 +38,7 @@ mod tests { use crate::core::Index; use crate::directory::{Directory, RAMDirectory, ReadOnlySource}; use crate::postings::TermInfo; - use crate::schema::{Document, FieldType, Schema, TEXT}; + use crate::schema::{Document, Schema, TEXT}; use 
std::path::PathBuf; use std::str; @@ -67,9 +67,7 @@ mod tests { let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = - TermDictionaryBuilder::create(write, &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) @@ -93,9 +91,7 @@ mod tests { let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = - TermDictionaryBuilder::create(write, &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u64)) .unwrap(); @@ -179,10 +175,8 @@ mod tests { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); - let field_type = FieldType::Str(TEXT); let buffer: Vec<u8> = { - let mut term_dictionary_builder = - TermDictionaryBuilder::create(vec![], &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -209,10 +203,8 @@ mod tests { #[test] fn test_stream_high_range_prefix_suffix() { - let field_type = FieldType::Str(TEXT); let buffer: Vec<u8> = { - let mut term_dictionary_builder = - TermDictionaryBuilder::create(vec![], &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); // term requires more than 16bits term_dictionary_builder .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) @@ -244,10 +236,8 @@ mod tests { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); - let field_type = FieldType::Str(TEXT); let
buffer: Vec<u8> = { - let mut term_dictionary_builder = - TermDictionaryBuilder::create(vec![], &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -313,10 +303,8 @@ mod tests { #[test] fn test_empty_string() { - let field_type = FieldType::Str(TEXT); let buffer: Vec<u8> = { - let mut term_dictionary_builder = - TermDictionaryBuilder::create(vec![], &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); term_dictionary_builder .insert(&[], &make_term_info(1 as u64)) .unwrap(); @@ -337,10 +325,8 @@ mod tests { #[test] fn test_stream_range_boundaries() { - let field_type = FieldType::Str(TEXT); let buffer: Vec<u8> = { - let mut term_dictionary_builder = - TermDictionaryBuilder::create(vec![], &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; term_dictionary_builder @@ -458,9 +444,7 @@ mod tests { let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); - let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = - TermDictionaryBuilder::create(write, &field_type).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs index 6bd47ee62..13d0b759a 100644 --- a/src/termdict/termdict.rs +++ b/src/termdict/termdict.rs @@ -4,8 +4,8 @@ use crate::common::BinarySerializable; use crate::common::CountingWriter; use crate::directory::ReadOnlySource; use crate::postings::TermInfo; -use crate::schema::FieldType; use crate::termdict::TermOrdinal; +use once_cell::sync::Lazy; use std::io::{self, Write}; use tantivy_fst; use
tantivy_fst::raw::Fst; @@ -29,7 +29,7 @@ where W: Write, { /// Creates a new `TermDictionaryBuilder` - pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> { + pub fn create(w: W) -> io::Result<Self> { let fst_builder = tantivy_fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilder { fst_builder, @@ -92,6 +92,14 @@ fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map { tantivy_fst::Map::from(fst) } +static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| { + let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new()) + .expect("Creating a TermDictionaryBuilder in a Vec should never fail") + .finish() + .expect("Writing in a Vec should never fail"); + ReadOnlySource::from(term_dictionary_data) +}); + /// The term dictionary contains all of the terms in /// `tantivy index` in a sorted manner. /// @@ -122,14 +130,8 @@ impl TermDictionary { } /// Creates an empty term dictionary which contains no terms. - pub fn empty(field_type: &FieldType) -> Self { - let term_dictionary_data: Vec<u8> = - TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type) - .expect("Creating a TermDictionaryBuilder in a Vec should never fail") - .finish() - .expect("Writing in a Vec should never fail"); - let source = ReadOnlySource::from(term_dictionary_data); - Self::from_source(&source) + pub fn empty() -> Self { + TermDictionary::from_source(&*EMPTY_DATA_SOURCE) } /// Returns the number of terms in the dictionary.