diff --git a/CHANGELOG.md b/CHANGELOG.md index 22aaa4c8f..07ad10f96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Tantivy 0.6 - Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270) - Completely uncompressed - Internally: One u64 fast field for indexes, one fast field for the bytes themselves. +- Add NGram token support (@drusellers) +- Add Stopword Filter support (@drusellers) Tantivy 0.5.2 =========================== @@ -91,7 +93,7 @@ Tantivy 0.3 Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus for their contribution to this release. -Thanks also to everyone in tantivy gitter chat +Thanks also to everyone in tantivy gitter chat for their advise and company :) https://gitter.im/tantivy-search/tantivy @@ -99,9 +101,9 @@ https://gitter.im/tantivy-search/tantivy Warning: -Tantivy 0.3 is NOT backward compatible with tantivy 0.2 +Tantivy 0.3 is NOT backward compatible with tantivy 0.2 code and index format. -You should not expect backward compatibility before +You should not expect backward compatibility before tantivy 1.0. @@ -127,7 +129,7 @@ Thanks to @KodrAus ! (#108) the natural ordering. - Building binary targets for tantivy-cli (Thanks to @KodrAus) - Misc invisible bug fixes, and code cleanup. 
-- Use +- Use diff --git a/Cargo.toml b/Cargo.toml index aa91d08ef..3e7bdb36c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ rust-stemmers = "0.1.0" downcast = { version="0.9" } matches = "0.1" bitpacking = "0.4" +fnv = "1.0.6" [target.'cfg(windows)'.dependencies] winapi = "0.2" diff --git a/src/lib.rs b/src/lib.rs index 29e3eb89b..f4946ab51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,6 +134,7 @@ extern crate byteorder; extern crate chan; extern crate combine; extern crate crossbeam; +extern crate fnv; extern crate fst; extern crate futures; extern crate futures_cpupool; diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index b5c70178c..99d1d5839 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -1,3 +1,28 @@ +//! # Example +//! ``` +//! extern crate tantivy; +//! use tantivy::tokenizer::*; +//! +//! # fn main() { +//! +//! let tokenizer = RawTokenizer +//! .filter(AlphaNumOnlyFilter); +//! +//! let mut stream = tokenizer.token_stream("hello there"); +//! // is none because the raw tokenizer emits one token that +//! // contains a space +//! assert!(stream.next().is_none()); +//! +//! let tokenizer = SimpleTokenizer +//! .filter(AlphaNumOnlyFilter); +//! +//! let mut stream = tokenizer.token_stream("hello there 💣"); +//! assert!(stream.next().is_some()); +//! assert!(stream.next().is_some()); +//! // the "emoji" is dropped because it's not alphanumeric +//! assert!(stream.next().is_none()); +//! # } +//! 
``` use super::{Token, TokenFilter, TokenStream}; /// `TokenFilter` that removes all tokens that contain non @@ -49,14 +74,12 @@ where } fn advance(&mut self) -> bool { - loop { - if self.tail.advance() { - if self.predicate(self.tail.token()) { - return true; - } - } else { - return false; + while self.tail.advance() { + if self.predicate(self.tail.token()) { + return true; } } + + false } } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 9cfb437bd..fd0bfbbde 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -137,6 +137,7 @@ mod raw_tokenizer; mod remove_long; mod simple_tokenizer; mod stemmer; +mod stop_word_filter; mod token_stream_chain; mod tokenizer; mod tokenizer_manager; @@ -150,6 +151,7 @@ pub use self::raw_tokenizer::RawTokenizer; pub use self::remove_long::RemoveLongFilter; pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; +pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; pub use self::tokenizer::BoxedTokenizer; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index a0e76dc44..6d615f848 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -68,7 +68,7 @@ impl NgramTokenizer { } pub struct NgramTokenStream<'a> { text: &'a str, - location: usize, + position: usize, text_length: usize, token: Token, min_gram: usize, @@ -83,7 +83,7 @@ impl<'a> Tokenizer<'a> for NgramTokenizer { fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { NgramTokenStream { text, - location: 0, + position: 0, text_length: text.len(), token: Token::default(), min_gram: self.min_gram, @@ -110,11 +110,11 @@ impl<'a> NgramTokenStream<'a> { self.gram_size = self.min_gram; // and move down the chain of letters - self.location += 1; + self.position += 1; } - let result = if (self.location + self.gram_size) <= self.text_length { 
- Some((self.location, self.gram_size)) + let result = if (self.position + self.gram_size) <= self.text_length { + Some((self.position, self.gram_size)) } else { None }; diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 27d3122bc..402c5fb6b 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -1,3 +1,21 @@ +//! # Example +//! ``` +//! extern crate tantivy; +//! use tantivy::tokenizer::*; +//! +//! # fn main() { +//! +//! let tokenizer = SimpleTokenizer +//! .filter(RemoveLongFilter::limit(5)); +//! +//! let mut stream = tokenizer.token_stream("toolong nice"); +//! // because `toolong` is more than 5 characters, it is filtered +//! // out of the token stream. +//! assert_eq!(stream.next().unwrap().text, "nice"); +//! assert!(stream.next().is_none()); +//! # } +//! ``` +//! use super::{Token, TokenFilter, TokenStream}; /// `RemoveLongFilter` removes tokens that are longer @@ -68,14 +86,12 @@ where } fn advance(&mut self) -> bool { - loop { - if self.tail.advance() { - if self.predicate(self.tail.token()) { - return true; - } - } else { - return false; + while self.tail.advance() { + if self.predicate(self.tail.token()) { + return true; } } + + false } } diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs new file mode 100644 index 000000000..6e8f08476 --- /dev/null +++ b/src/tokenizer/stop_word_filter.rs @@ -0,0 +1,97 @@ +//! # Example +//! ``` +//! extern crate tantivy; +//! use tantivy::tokenizer::*; +//! +//! # fn main() { +//! let tokenizer = SimpleTokenizer +//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()])); +//! +//! let mut stream = tokenizer.token_stream("the fox is crafty"); +//! assert_eq!(stream.next().unwrap().text, "fox"); +//! assert_eq!(stream.next().unwrap().text, "crafty"); +//! assert!(stream.next().is_none()); +//! # } +//! 
``` +use super::{Token, TokenFilter, TokenStream}; +use fnv::FnvHasher; +use std::collections::HashSet; +use std::hash::BuildHasherDefault; + +// configure our hashers for SPEED +type StopWordHasher = BuildHasherDefault<FnvHasher>; +type StopWordHashSet = HashSet<String, StopWordHasher>; + +/// `TokenFilter` that removes stop words from a token stream +#[derive(Clone)] +pub struct StopWordFilter { + words: StopWordHashSet, +} + +impl StopWordFilter { + /// Creates a `StopWordFilter` given a list of words to remove + pub fn remove(words: Vec<String>) -> StopWordFilter { + let mut set = StopWordHashSet::default(); + + for word in words { + set.insert(word); + } + + StopWordFilter { words: set } + } +} + +pub struct StopWordFilterStream<TailTokenStream> +where + TailTokenStream: TokenStream, +{ + words: StopWordHashSet, + tail: TailTokenStream, +} + +impl<TailTokenStream> TokenFilter<TailTokenStream> for StopWordFilter +where + TailTokenStream: TokenStream, +{ + type ResultTokenStream = StopWordFilterStream<TailTokenStream>; + + fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { + StopWordFilterStream::wrap(self.words.clone(), token_stream) + } +} + +impl<TailTokenStream> StopWordFilterStream<TailTokenStream> +where + TailTokenStream: TokenStream, +{ + fn predicate(&self, token: &Token) -> bool { + !self.words.contains(&token.text) + } + + fn wrap(words: StopWordHashSet, tail: TailTokenStream) -> StopWordFilterStream<TailTokenStream> { + StopWordFilterStream { words, tail } + } +} + +impl<TailTokenStream> TokenStream for StopWordFilterStream<TailTokenStream> +where + TailTokenStream: TokenStream, +{ + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } + + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.predicate(self.tail.token()) { + return true; + } + } + + false + } +}