Optimize TermDictionary::empty by precomputed data source (#767)

This commit is contained in:
Audun Halland
2020-01-30 02:04:58 +01:00
committed by GitHub
parent 5795488ba7
commit f85d0a522a
4 changed files with 23 additions and 38 deletions

View File

@@ -60,7 +60,7 @@ impl InvertedIndexReader {
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
InvertedIndexReader {
termdict: TermDictionary::empty(&field_type),
termdict: TermDictionary::empty(),
postings_source: ReadOnlySource::empty(),
positions_source: ReadOnlySource::empty(),
positions_idx_source: ReadOnlySource::empty(),

View File

@@ -148,8 +148,7 @@ impl<'a> FieldSerializer<'a> {
}
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {

View File

@@ -38,7 +38,7 @@ mod tests {
use crate::core::Index;
use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
use crate::postings::TermInfo;
use crate::schema::{Document, FieldType, Schema, TEXT};
use crate::schema::{Document, Schema, TEXT};
use std::path::PathBuf;
use std::str;
@@ -67,9 +67,7 @@ mod tests {
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
@@ -93,9 +91,7 @@ mod tests {
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
@@ -179,10 +175,8 @@ mod tests {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -209,10 +203,8 @@ mod tests {
#[test]
fn test_stream_high_range_prefix_suffix() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -244,10 +236,8 @@ mod tests {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -313,10 +303,8 @@ mod tests {
#[test]
fn test_empty_string() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.unwrap();
@@ -337,10 +325,8 @@ mod tests {
#[test]
fn test_stream_range_boundaries() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder
@@ -458,9 +444,7 @@ mod tests {
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))

View File

@@ -4,8 +4,8 @@ use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::directory::ReadOnlySource;
use crate::postings::TermInfo;
use crate::schema::FieldType;
use crate::termdict::TermOrdinal;
use once_cell::sync::Lazy;
use std::io::{self, Write};
use tantivy_fst;
use tantivy_fst::raw::Fst;
@@ -29,7 +29,7 @@ where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
pub fn create(w: W) -> io::Result<Self> {
let fst_builder = tantivy_fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilder {
fst_builder,
@@ -92,6 +92,14 @@ fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
tantivy_fst::Map::from(fst)
}
/// Serialized bytes of a term dictionary that holds no terms.
///
/// The bytes are produced lazily, the first time this static is dereferenced,
/// and are then reused by `TermDictionary::empty` instead of being rebuilt
/// on every call.
static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
    // Build a dictionary into an in-memory buffer without inserting any term.
    let empty_builder = TermDictionaryBuilder::create(Vec::<u8>::new())
        .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail");
    let empty_dictionary_bytes: Vec<u8> = empty_builder
        .finish()
        .expect("Writing in a Vec<u8> should never fail");
    ReadOnlySource::from(empty_dictionary_bytes)
});
/// The term dictionary contains all of the terms in
/// `tantivy index` in a sorted manner.
///
@@ -122,14 +130,8 @@ impl TermDictionary {
}
/// Creates an empty term dictionary which contains no terms.
pub fn empty(field_type: &FieldType) -> Self {
let term_dictionary_data: Vec<u8> =
TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
let source = ReadOnlySource::from(term_dictionary_data);
Self::from_source(&source)
pub fn empty() -> Self {
TermDictionary::from_source(&*EMPTY_DATA_SOURCE)
}
/// Returns the number of terms in the dictionary.