Merge branch 'master' of github.com:tantivy-search/tantivy into stamper_refactor

petr-tik · 2019-04-26 08:47:12 +01:00
10 changed files with 4104 additions and 44 deletions

@@ -1,6 +1,7 @@
 Tantivy 0.10.0
-====================
+=====================
+- Added an ASCII folding filter (@drusellers)
 - Bugfix in `query.count` in presence of deletes (@pmasurel)
 
 Minor
 ---------

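The headline entry here is the ASCII folding filter: it rewrites alphabetic, numeric, and symbolic Unicode characters that fall outside the ASCII range into their closest ASCII equivalents, where one exists. A toy sketch of the idea (a handful of illustrative mappings only; the real table is far larger, likely the oversized file diff further down, and can emit several characters for one input, e.g. 'Æ' becomes "AE"):

fn fold_char(c: char) -> char {
    // Illustrative subset of an ASCII folding table.
    match c {
        'à' | 'á' | 'â' | 'ä' => 'a',
        'è' | 'é' | 'ê' | 'ë' => 'e',
        'ç' => 'c',
        _ => c,
    }
}

fn fold(text: &str) -> String {
    text.chars().map(fold_char).collect()
}

// fold("Déjà") == "Deja"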

@@ -214,6 +214,13 @@ pub trait PostingsWriter {
+            if token.text.len() <= MAX_TOKEN_LEN {
                 term.set_text(token.text.as_str());
                 self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            } else {
+                info!(
+                    "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
+                     MAX_TOKEN_LEN in the documentation for more information.",
+                    token.text.len(),
+                    MAX_TOKEN_LEN
+                );
+            }
             };
             token_stream.process(&mut sink)

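This hunk makes the postings writer skip tokens longer than MAX_TOKEN_LEN, logging the event instead of indexing them. The guard in isolation, as a minimal sketch with a hypothetical index_token callback standing in for the subscribe call and an illustrative constant value:

const MAX_TOKEN_LEN: usize = 65_530; // illustrative value, not tantivy's actual constant

fn process_token(text: &str, mut index_token: impl FnMut(&str)) {
    if text.len() <= MAX_TOKEN_LEN {
        index_token(text);
    } else {
        // Oversized tokens are dropped rather than failing the indexing pass.
        eprintln!(
            "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped.",
            text.len(),
            MAX_TOKEN_LEN
        );
    }
}
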
File diff suppressed because it is too large.

@@ -44,18 +44,17 @@ where
     }
     fn advance(&mut self) -> bool {
-        if self.tail.advance() {
-            if self.token_mut().text.is_ascii() {
-                // fast track for ascii.
-                self.token_mut().text.make_ascii_lowercase();
-            } else {
-                to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
-                mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
-            }
-            true
-        } else {
-            false
+        if !self.tail.advance() {
+            return false;
         }
+        if self.token_mut().text.is_ascii() {
+            // fast track for ascii.
+            self.token_mut().text.make_ascii_lowercase();
+        } else {
+            to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+        }
+        true
     }
 }

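Same behaviour as before, restructured as an early return to flatten the nesting. The part worth noting is the allocation strategy: ASCII text is lowercased in place, while the Unicode path writes into a reusable scratch buffer and swaps it with the token text, so no new String is allocated per token. A standalone sketch of that trick (the helper's exact signature in tantivy may differ):

use std::mem;

// Lowercase `text` into `output`, reusing `output`'s allocation.
fn to_lowercase_unicode(text: &str, output: &mut String) {
    output.clear();
    for c in text.chars() {
        // char::to_lowercase can yield more than one char (e.g. for 'İ').
        output.extend(c.to_lowercase());
    }
}

fn lowercase_in_place(text: &mut String, buffer: &mut String) {
    if text.is_ascii() {
        // Fast path: ASCII lowercasing is in place and allocation-free.
        text.make_ascii_lowercase();
    } else {
        to_lowercase_unicode(text, buffer);
        // Swap instead of copy: both allocations survive for the next token.
        mem::swap(text, buffer);
    }
}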

@@ -131,6 +131,7 @@
 //! ```
 //!
 mod alphanum_only;
+mod ascii_folding_filter;
 mod facet_tokenizer;
 mod lower_caser;
 mod ngram_tokenizer;
@@ -144,6 +145,7 @@ mod tokenizer;
 mod tokenizer_manager;
 pub use self::alphanum_only::AlphaNumOnlyFilter;
+pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::lower_caser::LowerCaser;
 pub use self::ngram_tokenizer::NgramTokenizer;

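With the module declared and re-exported, AsciiFoldingFilter composes like the existing filters. A usage sketch against the 0.10-era tokenizer API (builder method names changed in later releases, so treat this as an assumption):

use tantivy::tokenizer::*;

fn demo() {
    let analyzer = SimpleTokenizer
        .filter(AsciiFoldingFilter)
        .filter(LowerCaser);
    let mut stream = analyzer.token_stream("Déjà Vu");
    while stream.advance() {
        println!("{}", stream.token().text); // prints "deja", then "vu"
    }
}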

@@ -29,12 +29,9 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
 impl TokenStream for RawTokenStream {
     fn advance(&mut self) -> bool {
-        if self.has_token {
-            self.has_token = false;
-            true
-        } else {
-            false
-        }
+        let result = self.has_token;
+        self.has_token = false;
+        result
     }
     fn token(&self) -> &Token {

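The raw tokenizer emits the whole input as a single token, so the branch on `has_token` collapses into reading the flag and unconditionally clearing it; behaviour is identical. The one-shot pattern in isolation (a minimal sketch, not the full TokenStream impl):

struct OneShot {
    has_token: bool,
}

impl OneShot {
    fn advance(&mut self) -> bool {
        // Report whether a token was pending, consuming it either way.
        let result = self.has_token;
        self.has_token = false;
        result
    }
}

`std::mem::replace(&mut self.has_token, false)` would express the same read-and-clear in a single call.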

@@ -91,7 +91,6 @@ where
                 return true;
             }
         }
         false
     }
 }

@@ -38,23 +38,16 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
     fn advance(&mut self) -> bool {
         self.token.text.clear();
         self.token.position = self.token.position.wrapping_add(1);
-        loop {
-            match self.chars.next() {
-                Some((offset_from, c)) => {
-                    if c.is_alphanumeric() {
-                        let offset_to = self.search_token_end();
-                        self.token.offset_from = offset_from;
-                        self.token.offset_to = offset_to;
-                        self.token.text.push_str(&self.text[offset_from..offset_to]);
-                        return true;
-                    }
-                }
-                None => {
-                    return false;
-                }
-            }
-        }
+        while let Some((offset_from, c)) = self.chars.next() {
+            if c.is_alphanumeric() {
+                let offset_to = self.search_token_end();
+                self.token.offset_from = offset_from;
+                self.token.offset_to = offset_to;
+                self.token.text.push_str(&self.text[offset_from..offset_to]);
+                return true;
+            }
+        }
+        false
     }
     fn token(&self) -> &Token {

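The loop/match pair collapses into a `while let`, which desugars to exactly the same control flow: scan until the character iterator is exhausted, return early at the first alphanumeric token start, and fall through to `false` at end of input. The shape in miniature, on a plain iterator:

fn contains_even(iter: &mut impl Iterator<Item = u32>) -> bool {
    // Same as `loop { match iter.next() { Some(n) => ..., None => return false } }`.
    while let Some(n) = iter.next() {
        if n % 2 == 0 {
            return true;
        }
    }
    false
}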

@@ -108,15 +108,14 @@
     }
     fn advance(&mut self) -> bool {
-        if self.tail.advance() {
-            // TODO remove allocation
-            let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
-            self.token_mut().text.clear();
-            self.token_mut().text.push_str(&stemmed_str);
-            true
-        } else {
-            false
+        if !self.tail.advance() {
+            return false;
         }
+        // TODO remove allocation
+        let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
+        self.token_mut().text.clear();
+        self.token_mut().text.push_str(&stemmed_str);
+        true
     }
 }

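As with the lowercaser, this is the same logic flipped into an early return. The `TODO remove allocation` points at `into_owned()`: the stemmer returns a `Cow<str>`, and `into_owned()` copies even when the token was already fully stemmed. A sketch of that copy path, with a hypothetical `stem` standing in for the real stemmer:

use std::borrow::Cow;

// Hypothetical stemmer: borrows the input when nothing changes.
fn stem(word: &str) -> Cow<'_, str> {
    match word.strip_suffix('s') {
        Some(stemmed) => Cow::Owned(stemmed.to_string()),
        None => Cow::Borrowed(word),
    }
}

fn restem_in_place(text: &mut String) {
    let stemmed: String = stem(text).into_owned();
    // clear + push_str reuses the token String's capacity instead of
    // replacing it wholesale.
    text.clear();
    text.push_str(&stemmed);
}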

@@ -104,7 +104,6 @@ where
                 return true;
             }
         }
         false
     }
 }