Various changes. Need to cherry-pick some of them into master

Paul Masurel
2017-12-25 10:35:10 +09:00
parent 23fad88b35
commit 1658be3792
7 changed files with 78 additions and 4 deletions

View File

@@ -63,7 +63,7 @@ debug-assertions = false
 [features]
-default = ["simdcompression", "streamdict"]
+default = ["simdcompression"]
 simdcompression = ["libc", "cc"]
 streamdict = []
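
This hunk removes streamdict from the default feature set while leaving the feature itself defined, so downstream crates can still opt back in explicitly. A hypothetical consumer-side Cargo.toml entry (version number illustrative only):

    [dependencies.tantivy]
    version = "0.5"
    features = ["streamdict"]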

View File

@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
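
Taken together, the hunk lowers the hash table's share of the per-thread budget from a third to a fifth, and collapses the closure so that mem::size_of::<KeyValue>() is applied once rather than twice as in the removed lines. A self-contained sketch of the resulting function, with KeyValue as a hypothetical stand-in for the stacker's real entry type and the tail of the iterator chain reconstructed from the visible lines:

    use std::mem;

    // Hypothetical stand-in for the stacker's hash table entry type.
    #[allow(dead_code)]
    struct KeyValue {
        key: u32,
        value: u32,
    }

    /// Splits a per-thread memory budget between the heap and the term hash
    /// table: cap the table at a fifth of the budget, pick the largest
    /// power-of-two table fitting under the cap, give the rest to the heap.
    fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
        let table_size_limit: usize = per_thread_memory_budget / 5;
        let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
        let table_num_bits: usize = (1..)
            .take_while(|&num_bits| compute_table_size(num_bits) < table_size_limit)
            .last()
            .expect("memory budget too small to fit a hash table");
        let heap_size = per_thread_memory_budget - compute_table_size(table_num_bits);
        (heap_size, table_num_bits)
    }

    fn main() {
        // With a 30 MB budget and 8-byte entries, the cap is 6 MB, so the
        // table gets 2^19 entries (4 MB) and the heap keeps the remaining
        // ~25.8 MB.
        let (heap, bits) = split_memory(30_000_000);
        println!("heap = {} bytes, table = 2^{} entries", heap, bits);
    }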

View File

@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }
+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }
+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }
+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();
@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }
+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
             let store_reader = reader.get_store_reader();
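
These #[inline(never)] attributes (here and on SegmentSerializer::close in the next file) change no behavior; the usual reason to add them is profiling, since a function that is never inlined keeps its own stack frame and so shows up as a distinct entry in tools like perf instead of being folded into its caller. A minimal stand-alone illustration of the attribute (not code from this commit):

    // Without the attribute, the optimizer may inline `hot_phase` into
    // `main`, and its cost would be attributed to the caller in a profile.
    #[inline(never)]
    fn hot_phase(data: &[u64]) -> u64 {
        data.iter().sum()
    }

    fn main() {
        let data: Vec<u64> = (0..1_000_000).collect();
        println!("{}", hot_phase(&data));
    }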

View File

@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }
     /// Finalize the segment serialization.
+    #[inline(never)]
     pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;

View File

@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    tail: TailTokenStream,
+}
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+    fn wrap(
+        tail: TailTokenStream,
+    ) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream {
+            tail
+        }
+    }
+}
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
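
The new filter keeps only tokens made entirely of ASCII alphanumeric characters; advance() keeps pulling from the wrapped stream until a token passes the predicate or the stream is exhausted. A usage sketch, assuming the Tokenizer::filter and token_stream API as wired up elsewhere in this commit:

    use tantivy::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, Tokenizer};

    fn main() {
        // "café" survives SimpleTokenizer (é is alphanumeric for Unicode)
        // but is dropped here, because é is not ASCII alphanumeric.
        let tokenizer = SimpleTokenizer.filter(AlphaNumOnlyFilter);
        let mut stream = tokenizer.token_stream("a café for 2 people");
        while stream.advance() {
            println!("{}", stream.token().text); // prints: a, for, 2, people
        }
    }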

View File

@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;
@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
         SimpleTokenizer
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
+            .filter(AlphaNumOnlyFilter)
             .filter(Stemmer::new()),
     );
     manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));