diff --git a/Cargo.toml b/Cargo.toml
index 82c1d1977..f49dd7f4f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -63,7 +63,7 @@ debug-assertions = false

 [features]
-default = ["simdcompression", "streamdict"]
+default = ["simdcompression"]
 simdcompression = ["libc", "cc"]
 streamdict = []
diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs
index 52beb57dd..29ad4c728 100644
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index b9b8b5635..223aab455 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }

+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }

+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }

+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();

@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }

+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
             let store_reader = reader.get_store_reader();
diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
index dd836b835..82c931350 100644
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }

     /// Finalize the segment serialization.
+    #[inline(never)]
     pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs
new file mode 100644
index 000000000..2e009e45d
--- /dev/null
+++ b/src/tokenizer/alphanum_only.rs
@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    tail: TailTokenStream,
+}
+
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    fn wrap(
+        tail: TailTokenStream,
+    ) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream {
+            tail
+        }
+    }
+}
+
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+    where
+        TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+    where
+        TailTokenStream: TokenStream
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index cdcd2346e..1cef50db6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
+
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index 812660d82..06f955db8 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;

@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
+                .filter(AlphaNumOnlyFilter)
                 .filter(Stemmer::new()),
         );
         manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
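
Usage sketch: the new AlphaNumOnlyFilter composes into a pipeline exactly like the other filters in the updated `en_stem` registration above. The snippet below is a minimal, hypothetical example assuming the `tokenizer` API shown in this diff; the analyzer name `en_alnum` and the helper `register_example` are invented for illustration.

    use tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter,
                    SimpleTokenizer, Tokenizer, TokenizerManager};

    fn register_example(manager: &TokenizerManager) {
        // Mirrors the updated `en_stem` pipeline, minus the stemming step.
        manager.register(
            "en_alnum",
            SimpleTokenizer
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                // AlphaNumOnlyFilter drops any token containing a character
                // that is not ASCII-alphanumeric, e.g. "can't" or "foo-bar";
                // such tokens are removed entirely, not split or stripped.
                .filter(AlphaNumOnlyFilter),
        );
    }

Note the design choice visible in `advance()`: when `predicate` fails, the filter loops and pulls the next token from the tail stream, so tokens with apostrophes, hyphens, or non-ASCII letters never reach the downstream `Stemmer`.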