mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Various changes. Need to cherry-pick some of them and put them into master
@@ -63,7 +63,7 @@ debug-assertions = false
 [features]
-default = ["simdcompression", "streamdict"]
+default = ["simdcompression"]
 simdcompression = ["libc", "cc"]
 streamdict = []
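The streamdict feature is still defined, just no longer enabled by default. A minimal sketch of how a downstream crate would opt back in, or opt out of the defaults entirely (the version number is a placeholder, not taken from this commit):

[dependencies]
# Opt back in to streamdict on top of the default features:
tantivy = { version = "0.x", features = ["streamdict"] }

# Or build with no default features at all, which drops simdcompression
# and with it the libc/cc build dependencies:
# tantivy = { version = "0.x", default-features = false }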
@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
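Two changes here: the closure no longer multiplies by mem::size_of::<KeyValue>() twice (the old body computed the table size and then multiplied it by the entry size again), and the table is now capped at a fifth of the budget instead of a third. The hunk cuts off after .into_iter(); a sketch of the whole routine under those assumptions, with a stand-in KeyValue type:

use std::mem;

// Stand-in for the crate's real hash-table bucket type; assumed 8 bytes here.
struct KeyValue {
    _key_addr: u32,
    _value_addr: u32,
}

/// Returns (the heap size in bytes, the hash table size in number of bits).
/// Reconstruction: everything after `.into_iter()` is assumed, not shown in the hunk.
pub fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
    let table_size_limit: usize = per_thread_memory_budget / 5;
    let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
    // Largest power-of-two table whose byte size stays under the limit.
    let table_num_bits: usize = (1..)
        .take_while(|num_bits| compute_table_size(*num_bits) < table_size_limit)
        .last()
        .expect("per-thread memory budget too small");
    let table_size = compute_table_size(table_num_bits);
    (per_thread_memory_budget - table_size, table_num_bits)
}

fn main() {
    // With a 30 MB budget and an 8-byte KeyValue: limit = 6 MB,
    // 2^19 * 8 = 4 MiB fits but 2^20 * 8 = 8 MiB does not,
    // so the table gets 19 bits and the heap keeps the rest.
    let (heap, bits) = split_memory(30_000_000);
    assert_eq!(bits, 19);
    assert_eq!(heap, 30_000_000 - (1 << 19) * 8);
    println!("heap = {} bytes, table bits = {}", heap, bits);
}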
@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }
 
+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }
 
+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();
@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
             let store_reader = reader.get_store_reader();
@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }
 
     /// Finalize the segment serialization.
+    #[inline(never)]
     pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
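Every serialization entry point above gains #[inline(never)]. The commit does not say why; a plausible reading is profiling, since un-inlined functions keep their own stack frames and show up as separate symbols in perf output. A minimal illustration of the attribute (the function name is invented):

// `checksum` keeps its own call frame, so a sampling profiler
// attributes its cost to `checksum` rather than folding it into `main`.
#[inline(never)]
fn checksum(bytes: &[u8]) -> u64 {
    bytes.iter().map(|&b| b as u64).sum()
}

fn main() {
    let data = vec![42u8; 1_000_000];
    println!("{}", checksum(&data));
}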
src/tokenizer/alphanum_only.rs (new file, 65 lines)
@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    tail: TailTokenStream,
+}
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream { tail }
+    }
+}
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
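A minimal usage sketch for the new filter (assuming the Tokenizer trait exposes token_stream and that the crate root re-exports the tokenizer module, as the rest of the diff suggests). Note the predicate uses is_ascii_alphanumeric, so a token like "café" is dropped outright rather than trimmed:

use tantivy::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TokenStream, Tokenizer};

fn main() {
    let tokenizer = SimpleTokenizer.filter(AlphaNumOnlyFilter);
    let mut stream = tokenizer.token_stream("hello 42 café");
    let mut kept: Vec<String> = Vec::new();
    while stream.advance() {
        kept.push(stream.token().text.clone());
    }
    // "café" contains a non-ASCII character, so the whole token is filtered out.
    assert_eq!(kept, vec!["hello", "42"]);
}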
@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
 
+
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;
@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;
@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
+                .filter(AlphaNumOnlyFilter)
                 .filter(Stemmer::new()),
         );
         manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
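A sketch of registering the same chain under a custom name, mirroring what the patched Default impl does (the name "en_alnum" is arbitrary, and register taking &self is an assumption based on its use in the hunk above):

use tantivy::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer,
                         Stemmer, Tokenizer, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();
    // Same pipeline as the patched default English analyzer: split,
    // drop very long tokens, lowercase, keep only ASCII-alphanumeric
    // tokens, then stem.
    manager.register(
        "en_alnum",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(AlphaNumOnlyFilter)
            .filter(Stemmer::new()),
    );
}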